% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/slope.R
\name{SLOPE}
\alias{SLOPE}
\title{Sorted L-One Penalized Estimation}
\usage{
SLOPE(
  x,
  y,
  family = c("gaussian", "binomial", "multinomial", "poisson"),
  intercept = TRUE,
  center = c("mean", "min", "none"),
  scale = c("sd", "l1", "l2", "max_abs", "none"),
  alpha = c("path", "estimate"),
  lambda = c("bh", "gaussian", "oscar", "lasso"),
  alpha_min_ratio = if (NROW(x) < NCOL(x)) 0.01 else 1e-04,
  path_length = 100,
  q = 0.1,
  theta1 = 1,
  theta2 = 0.5,
  tol_dev_change = 1e-05,
  tol_dev_ratio = 0.999,
  max_variables = NROW(x) + 1,
  solver = c("auto", "hybrid", "pgd", "fista", "admm"),
  max_passes = 1e+06,
  tol = 1e-04,
  threads = NULL,
  diagnostics = FALSE,
  patterns = FALSE,
  gamma = 1,
  cd_type = c("permuted", "cyclical"),
  tol_abs,
  tol_rel,
  tol_rel_gap,
  tol_infeas,
  tol_rel_coef_change,
  prox_method,
  screen,
  verbosity,
  screen_alg
)
}
\arguments{
\item{x}{the design matrix, which can be either a dense
matrix of the standard \emph{matrix} class, or a sparse matrix
inheriting from \link[Matrix:sparseMatrix]{Matrix::sparseMatrix}. Data frames will
be converted to matrices internally.}

\item{y}{the response, which for \code{family = "gaussian"} must be numeric; for
\code{family = "binomial"} or \code{family = "multinomial"}, it can be a factor.}

\item{family}{model family (objective); see \strong{Families} for details.}

\item{intercept}{whether to fit an intercept}

\item{center}{whether to center predictors or not by their mean. Defaults
to \code{TRUE} if \code{x} is dense and \code{FALSE} otherwise.}

\item{scale}{type of scaling to apply to predictors.
\itemize{
\item \code{"l1"} scales predictors to have L1 norms of one.
\item \code{"l2"} scales predictors to have L2 norms of one.#'
\item \code{"sd"} scales predictors to have a population standard deviation one.
\item \code{"none"} applies no scaling.
}}

\item{alpha}{scale for regularization path: either a decreasing numeric
vector (possibly of length 1) or a character vector; in the latter case,
the choices are:
\itemize{
\item \code{"path"}, which computes a regularization sequence
where the first value corresponds to the intercept-only (null) model and
the last to the almost-saturated model, and
\item \code{"estimate"}, which estimates a \emph{single} \code{alpha}
using Algorithm 5 in Bogdan et al. (2015).
}

When a value is manually entered for \code{alpha}, it will be scaled based
on the type of standardization that is applied to \code{x}. For \code{scale = "l2"},
\code{alpha} will be scaled by \eqn{\sqrt n}. For \code{scale = "sd"} or \code{"none"},
alpha will be scaled by \eqn{n}, and for \code{scale = "l1"} no scaling is
applied. Note, however, that the \code{alpha} that is returned in the
resulting value is the \strong{unstandardized} alpha.}

\item{lambda}{either a character vector indicating the method used
to construct the lambda path or a numeric non-decreasing
vector with length equal to the number
of coefficients in the model; see section \strong{Regularization sequences}
for details.}

\item{alpha_min_ratio}{smallest value for \code{lambda} as a fraction of
\code{lambda_max}; used in the selection of \code{alpha} when \code{alpha = "path"}.}

\item{path_length}{length of regularization path; note that the path
returned may still be shorter due to the early termination criteria
given by \code{tol_dev_change}, \code{tol_dev_ratio}, and \code{max_variables}.}

\item{q}{parameter controlling the shape of the lambda sequence, with
usage varying depending on the type of path used and has no effect
is a custom \code{lambda} sequence is used. Must be greater than \code{1e-6} and
smaller than 1.}

\item{theta1}{parameter controlling the shape of the lambda sequence
when \code{lambda == "OSCAR"}. This parameter basically sets the intercept
for the lambda sequence and is equivalent to \eqn{\lambda_1} in the
original OSCAR formulation.}

\item{theta2}{parameter controlling the shape of the lambda sequence
when \code{lambda == "OSCAR"}. This parameter basically sets the slope
for the lambda sequence and is equivalent to \eqn{\lambda_2} in the
original OSCAR formulation.}

\item{tol_dev_change}{the regularization path is stopped if the
fractional change in deviance falls below this value; note that this is
automatically set to 0 if a alpha is manually entered}

\item{tol_dev_ratio}{the regularization path is stopped if the
deviance ratio \eqn{1 - \mathrm{deviance}/\mathrm{(null-deviance)}
  }{1 - deviance/(null deviance)} is above this threshold}

\item{max_variables}{criterion for stopping the path in terms of the
maximum number of unique, nonzero coefficients in absolute value in model.
For the multinomial family, this value will be multiplied internally with
the number of levels of the response minus one.}

\item{solver}{type of solver use, either \code{"auto"}, \code{"hybrid"}, \code{"pgd"}, or
\code{"fista"}; \code{"auto"} means that the solver is automatically selected,
which currently means that \code{"hybrid"} is used for all objectives
except multinomial ones, in which case FISTA (\code{"fista"}) is used.}

\item{max_passes}{maximum number of passes (outer iterations) for solver}

\item{tol}{stopping criterion for the solvers in terms of the relative
duality gap}

\item{threads}{number of threads to use in the solver; if \code{NULL}, half
of the available (logical) threads will be used}

\item{diagnostics}{whether to save diagnostics from the solver
(timings and other values depending on type of solver)}

\item{patterns}{whether to return the SLOPE pattern
(cluster, ordering, and sign information) as a list of sparse
matrices, one for each step on the path.}

\item{gamma}{relaxation mixing parameter, between 0 and 1. Has no effect if
set to 0. If larger than 0, the solver will mix the coefficients
from the ordinary SLOPE solutions with the coefficients from the
relaxed solutions (fitting OLS on the SLOPE pattern).}

\item{cd_type}{Type of coordinate descent to use, either \code{"cyclical"} or
\code{"permuted"}. The former means that the cluster are cycled through
in descending order of their coefficients' magnitudes, while the
latter means that the clusters are permuted randomly for each pass.}

\item{tol_abs}{DEPRECATED}

\item{tol_rel}{relative DEPRECATED}

\item{tol_rel_gap}{DEPRECATED}

\item{tol_infeas}{DEPRECATED}

\item{tol_rel_coef_change}{DEPRECATED}

\item{prox_method}{DEPRECATED}

\item{screen}{DEPRECATED}

\item{verbosity}{DEPRECATED}

\item{screen_alg}{DEPRECATED}
}
\value{
An object of class \code{"SLOPE"} with the following slots:
\item{coefficients}{
a list of the coefficients from the
model fit, not including the intercept.
The coefficients are stored as sparse matrices.
}
\item{nonzeros}{
a three-dimensional logical array indicating whether a
coefficient was zero or not
}
\item{lambda}{
the lambda vector that when multiplied by a value in \code{alpha}
gives the penalty vector at that point along the regularization
path
}
\item{alpha}{
vector giving the (unstandardized) scaling of the lambda sequence
}
\item{class_names}{
a character vector giving the names of the classes for binomial and
multinomial families
}
\item{passes}{the number of passes the solver took at each step on the path}
\item{deviance_ratio}{
the deviance ratio (as a fraction of 1)
}
\item{null_deviance}{
the deviance of the null (intercept-only) model
}
\item{family}{
the name of the family used in the model fit
}
\item{diagnostics}{
a \code{data.frame} of objective values for the primal and dual problems, as
well as a measure of the infeasibility, time, and iteration; only
available if \code{diagnostics = TRUE} in the call to \code{\link[=SLOPE]{SLOPE()}}.
}
\item{call}{the call used for fitting the model}
}
\description{
Fit a generalized linear model regularized with the
sorted L1 norm, which applies a
non-increasing regularization sequence to the
coefficient vector (\eqn{\beta}) after having sorted it
in decreasing order according  to its absolute values.
}
\details{
\code{SLOPE()} solves the convex minimization problem
\deqn{
  f(\beta) + \alpha \sum_{i=j}^p \lambda_j |\beta|_{(j)},
}{
  f(\beta) + \alpha \sum \lambda_j |\beta|_(j),
}
where \eqn{f(\beta)} is a smooth and convex function and
the second part is the sorted L1-norm.
In ordinary least-squares regression,
\eqn{f(\beta)} is simply the squared norm of the least-squares residuals.
See section \strong{Families} for specifics regarding the various types of
\eqn{f(\beta)} (model families) that are allowed in \code{SLOPE()}.

By default, \code{SLOPE()} fits a path of models, each corresponding to
a separate regularization sequence, starting from
the null (intercept-only) model to an almost completely unregularized
model. These regularization sequences are parameterized using
\eqn{\lambda} and \eqn{\alpha}, with only \eqn{\alpha} varying along the
path. The length of the path can be manually, but will terminate
prematurely depending on
arguments \code{tol_dev_change}, \code{tol_dev_ratio}, and \code{max_variables}.
This means that unless these arguments are modified, the path is not
guaranteed to be of length \code{path_length}.
}
\section{Families}{


\strong{Gaussian}

The Gaussian model (Ordinary Least Squares) minimizes the following
objective:
\deqn{
  \frac{1}{2} \Vert y - X\beta\Vert_2^2
}{
  (1/(2))||y - X \beta||_2^2
}

\strong{Binomial}

The binomial model (logistic regression) has the following objective:
\deqn{
  \sum_{i=1}^n \log\left(1+ \exp\left(
    - y_i \left(x_i^T\beta + \beta_0 \right) \right) \right)
}{
  \sum log(1+ exp(- y_i x_i^T \beta))
}
with \eqn{y \in \{-1, 1\}}{y in {-1, 1}}.

\strong{Poisson}

In poisson regression, we use the following objective:

\deqn{
  -\sum_{i=1}^n \left(y_i\left(
    x_i^T\beta + \beta_0\right) - \exp\left(x_i^T\beta + \beta_0
  \right)\right)
}{
  -\sum (y_i(x_i^T\beta + \beta_0) - exp(x_i^T\beta + \beta_0))
}

\strong{Multinomial}

In multinomial regression, we minimize the full-rank objective
\deqn{
  -\sum_{i=1}^n\left(
    \sum_{k=1}^{m-1} y_{ik}(x_i^T\beta_k + \beta_{0,k})
    - \log\sum_{k=1}^{m-1} \exp\big(x_i^T\beta_k + \beta_{0,k}\big)
  \right)
}{
  -\sum(y_ik(x_i^T\beta_k + \beta_{0,k})
  - log(\sum exp(x_i^T\beta_k + \alpha_{0,k})))
}
with \eqn{y_{ik}} being the element in a \eqn{n} by \eqn{(m-1)} matrix, where
\eqn{m} is the number of classes in the response.
}

\section{Regularization Sequences}{

There are multiple ways of specifying the \code{lambda} sequence
in \code{SLOPE()}. It is, first of all, possible to select the sequence manually
by
using a non-increasing
numeric vector, possibly of length one, as argument instead of a character.
The greater the differences are between
consecutive values along the sequence, the more clustering behavior
will the model exhibit. Note, also, that the scale of the \eqn{\lambda}
vector makes no difference if \code{alpha = NULL}, since \code{alpha} will be
selected automatically to ensure that the model is completely sparse at the
beginning and almost unregularized at the end. If, however, both
\code{alpha} and \code{lambda} are manually specified, then the scales of both do
matter, so make sure to choose them wisely.

Instead of choosing the sequence manually, one of the following
automatically generated sequences may be chosen.

\strong{BH (Benjamini--Hochberg)}

If \code{lambda = "bh"}, the sequence used is that referred to
as \eqn{\lambda^{(\mathrm{BH})}}{\lambda^(BH)} by Bogdan et al, which sets
\eqn{\lambda} according to
\deqn{
  \lambda_i = \Phi^{-1}(1 - iq/(2p)),
}{
  \lambda_i = \Phi^-1(1 - iq/(2p)),
}
for \eqn{i=1,\dots,p}, where \eqn{\Phi^{-1}}{\Phi^-1} is the quantile
function for the standard normal distribution and \eqn{q} is a parameter
that can be set by the user in the call to \code{SLOPE()}.

\strong{Gaussian}

This penalty sequence is related to BH, such that
\deqn{
  \lambda_i = \lambda^{(\mathrm{BH})}_i
  \sqrt{1 + w(i-1)\cdot \mathrm{cumsum}(\lambda^2)_i},
}{
  \lambda_i = \lambda^(BH)_i \sqrt{1 + w(i-1) * cumsum(\lambda^2)_i},
}
for \eqn{i=1,\dots,p}, where \eqn{w(k) = 1/(n-k-1)}. We let
\eqn{\lambda_1 = \lambda^{(\mathrm{BH})}_1}{\lambda_1 = \lambda^(BH)_1} and
adjust the sequence to make sure that it's non-increasing.
Note that if \eqn{p} is large relative
to \eqn{n}, this option will result in a constant sequence, which is
usually not what you would want.

\strong{OSCAR}

This sequence comes from Bondell and Reich and is a linear non-increasing
sequence, such that
\deqn{
  \lambda_i = \theta_1 + (p - i)\theta_2.
}
for \eqn{i = 1,\dots,p}. We use the parametrization from Zhong and Kwok
(2021) but use \eqn{\theta_1} and \eqn{\theta_2} instead of \eqn{\lambda_1}
and \eqn{\lambda_2} to avoid confusion and abuse of notation.

\strong{lasso}

SLOPE is exactly equivalent to the
lasso when the sequence of regularization weights is constant, i.e.
\deqn{
  \lambda_i = 1
}
for \eqn{i = 1,\dots,p}. Here, again, we stress that the fact that
all \eqn{\lambda} are equal to one does not matter as long as
\code{alpha == NULL} since we scale the vector automatically.
Note that this option is only here for academic interest and
to highlight the fact that SLOPE is
a generalization of the lasso. There are more efficient packages, such as
\strong{glmnet} and \strong{biglasso}, for fitting the lasso.
}

\section{Solvers}{


There are currently three solvers available for SLOPE: Hybrid (Beck and
Teboulle 2009), proximal gradient descent (PGD), and FISTA (Beck and
Teboulle, 2009). The hybrid method is the preferred and generally
fastest method and is therefore the default for the Gaussian and
binomial families, but not currently available for multinomial and
disabled for Poisson due to convergence issues.
}

\examples{

# Gaussian response, default lambda sequence
fit <- SLOPE(bodyfat$x, bodyfat$y)

# Multinomial response, custom alpha and lambda
m <- length(unique(wine$y)) - 1
p <- ncol(wine$x)

alpha <- 0.005
lambda <- exp(seq(log(2), log(1.8), length.out = p * m))

fit <- SLOPE(
  wine$x,
  wine$y,
  family = "multinomial",
  lambda = lambda,
  alpha = alpha
)
}
\references{
Bogdan, M., van den Berg, E., Sabatti, C., Su, W., & Candès, E. J. (2015).
SLOPE -- adaptive variable selection via convex optimization. The Annals of
Applied Statistics, 9(3), 1103–1140.

Larsson, J., Klopfenstein, Q., Massias, M., & Wallin, J. (2023). Coordinate
descent for SLOPE. In F. Ruiz, J. Dy, & J.-W. van de Meent (Eds.),
Proceedings of the 26th international conference on artificial
intelligence and statistics (Vol. 206, pp. 4802–4821). PMLR.
https://proceedings.mlr.press/v206/larsson23a.html

Bondell, H. D., & Reich, B. J. (2008). Simultaneous Regression Shrinkage,
Variable Selection, and Supervised Clustering of Predictors with OSCAR.
Biometrics, 64(1), 115–123. JSTOR.

Boyd, S., Parikh, N., Chu, E., Peleato, B., & Eckstein, J. (2010).
Distributed Optimization and Statistical Learning via the Alternating
Direction Method of Multipliers. Foundations and Trends® in Machine Learning,
3(1), 1–122.

Beck, A., & Teboulle, M. (2009). A Fast Iterative Shrinkage-Thresholding
Algorithm for Linear Inverse Problems. SIAM Journal on Imaging Sciences,
2(1), 183–202.
}
\seealso{
\code{\link[=plot.SLOPE]{plot.SLOPE()}}, \code{\link[=plotDiagnostics]{plotDiagnostics()}}, \code{\link[=score]{score()}}, \code{\link[=predict.SLOPE]{predict.SLOPE()}},
\code{\link[=trainSLOPE]{trainSLOPE()}}, \code{\link[=coef.SLOPE]{coef.SLOPE()}}, \code{\link[=print.SLOPE]{print.SLOPE()}}, \code{\link[=print.SLOPE]{print.SLOPE()}},
\code{\link[=deviance.SLOPE]{deviance.SLOPE()}}, \code{\link[=sortedL1Prox]{sortedL1Prox()}}
}
