Looking at the computation inside this function:
Q = randomized_range_finder(X, size=n_random, n_iter=n_iter, power_iteration_normalizer=power_iteration_normalizer, random_state=random_state)
## Q = an orthonormal basis for the range of X
B = safe_sparse_dot(Q.T, X)
## B = Q.T * X, i.e. X projected onto that orthonormal basis
Uhat, s, Vt = linalg.svd(B, full_matrices=False)
######
Uhat : Unitary array(s). The first B.ndim - 2 dimensions have the same size as those of the input B.
s : Vector(s) with the singular values, sorted in descending order within each vector. The first B.ndim - 2 dimensions have the same size as those of the input B.
Vt : Unitary array(s). The first B.ndim - 2 dimensions have the same size as those of the input B.
B = (Uhat * s) @ Vt
######
U = np.dot(Q, Uhat)
## U = Q * Uhat
U, Vt = svd_flip(U, Vt)
#######
svd_flip : Sign correction to ensure deterministic output from SVD.
Adjusts the columns of u and the rows of v such that the loadings in the columns in u that are largest in absolute value are always positive.
#######
return U[:, :n_components], s[:n_components], Vt[:n_components, :]
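To make these steps concrete, here is a minimal numpy sketch of the same computation (an illustration, not sklearn's code): it replaces randomized_range_finder with a plain Gaussian projection followed by QR, which is the same idea without power iterations, and the 1000 x 6 shape is an arbitrary assumption.

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(1000, 6)
n_random = 6  # hypothetical: n_components + n_oversamples, capped at min(X.shape)
Omega = rng.normal(size=(X.shape[1], n_random))  # random test matrix
Q, _ = np.linalg.qr(X @ Omega)  # orthonormal basis for (approximately) the range of X
B = Q.T @ X                     # project X onto that basis
Uhat, s, Vt = np.linalg.svd(B, full_matrices=False)
U = Q @ Uhat                    # map the left singular vectors back into the original space
print(np.allclose(U @ np.diag(s) @ Vt, X))  # True here, since n_random >= rank(X)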
Then, in fit_transform(X) (the non-whitened branch):
U, S, Vt = self._fit(X)
U = U[:, : self.n_components_]
U *= S[: self.n_components_]
return U
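The rescaling in the last two lines relies on the identity X @ V = U * S (the columns of V are orthonormal), which can be checked directly; the data below is the same assumption as above.

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(1000, 6)
X = X - X.mean(axis=0)  # PCA works on centered data
U, S, Vt = np.linalg.svd(X, full_matrices=False)
k = 2
print(np.allclose(U[:, :k] * S[:k], X @ Vt[:k].T))  # True: scaling U by S gives the projection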
[Sources]
https://scikit-learn.org/stable/modules/generated/sklearn.utils.extmath.randomized_svd.html
https://scikit-learn.org/stable/modules/generated/sklearn.utils.extmath.randomized_range_finder.html
https://numpy.org/doc/stable/reference/generated/numpy.linalg.svd.html
https://www.kite.com/python/docs/sklearn.utils.extmath.svd_flip
===============================================================================
Next, we trace fit(self, X, y=None). First, the PCA class constructor:

class PCA(_BasePCA):
    def __init__(
        self,
        n_components=None,
        *,
        copy=True,
        whiten=False,
        svd_solver="auto",
        tol=0.0,
        iterated_power="auto",
        n_oversamples=10,
        random_state=None,
    ):
In PCA(n_components=2).fit(X), the PCA object is created with self.n_components = 2.
def fit(self, X, y=None):
    check_scalar(
        self.n_oversamples,
        "n_oversamples",
        min_val=1,
        target_type=numbers.Integral,
    )

    self._fit(X)
    return self
sklearn.utils.check_scalar(x, name, target_type, *, min_val=None, max_val=None): validates the type and value of a scalar parameter.
Since no n_oversamples argument was passed, self.n_oversamples = 10. check_scalar raises an error if this value (reported under the name "n_oversamples") is not of type numbers.Integral or is smaller than the minimum of 1.
fit() then delegates to _fit().
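A quick illustration of check_scalar with these exact arguments (the failing calls are commented out; uncomment one to see the error):

import numbers
from sklearn.utils import check_scalar

check_scalar(10, "n_oversamples", target_type=numbers.Integral, min_val=1)  # passes silently
# check_scalar(1.5, "n_oversamples", target_type=numbers.Integral, min_val=1)  # TypeError: wrong type
# check_scalar(0, "n_oversamples", target_type=numbers.Integral, min_val=1)    # ValueError: below min_val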
def _fit(self, X):
    """Dispatch to the right submethod depending on the chosen solver."""
    # Raise an error for sparse input.
    # This is more informative than the generic one raised by check_array.
    if issparse(X):
        raise TypeError(
            "PCA does not support sparse input. See "
            "TruncatedSVD for a possible alternative."
        )

    X = self._validate_data(
        X, dtype=[np.float64, np.float32], ensure_2d=True, copy=self.copy
    )

    # Handle n_components==None
    if self.n_components is None:
        if self.svd_solver != "arpack":
            n_components = min(X.shape)
        else:
            n_components = min(X.shape) - 1
    else:
        n_components = self.n_components

    # Handle svd_solver
    self._fit_svd_solver = self.svd_solver
    if self._fit_svd_solver == "auto":
        # Small problem or n_components == 'mle', just call full PCA
        if max(X.shape) <= 500 or n_components == "mle":
            self._fit_svd_solver = "full"
        elif n_components >= 1 and n_components < 0.8 * min(X.shape):
            self._fit_svd_solver = "randomized"
        # This is also the case of n_components in (0,1)
        else:
            self._fit_svd_solver = "full"

    # Call different fits for either full or truncated SVD
    if self._fit_svd_solver == "full":
        return self._fit_full(X, n_components)
    elif self._fit_svd_solver in ["arpack", "randomized"]:
        return self._fit_truncated(X, n_components, self._fit_svd_solver)
    else:
        raise ValueError(
            "Unrecognized svd_solver='{0}'".format(self._fit_svd_solver)
        )
SVD is singular value decomposition. Where eigendecomposition only handles square matrices, SVD can be seen as its generalization with the "square" constraint removed.
Here 0.8 * min(X.shape) = 4.8 and self.n_components = 2, so the branch elif n_components >= 1 and n_components < 0.8 * min(X.shape): is taken (this also requires max(X.shape) > 500, otherwise the first branch would have chosen "full").
Therefore self._fit_svd_solver = "randomized".
So we need to examine self._fit_truncated(X, n_components, self._fit_svd_solver).
[Source] Singular value decomposition: https://darkpgmr.tistory.com/106
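The dispatch can be reproduced outside the class; the 1000 x 6 shape below is an assumption chosen so that min(X.shape) = 6 matches the 0.8 * 6 = 4.8 above and max(X.shape) > 500 skips the small-problem branch.

import numpy as np

X = np.random.rand(1000, 6)
n_components = 2

# same decision logic as the "auto" branch of _fit
if max(X.shape) <= 500 or n_components == "mle":
    solver = "full"
elif n_components >= 1 and n_components < 0.8 * min(X.shape):
    solver = "randomized"
else:
    solver = "full"
print(solver)  # "randomized": 1000 > 500 and 1 <= 2 < 4.8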
def _fit_truncated(self, X, n_components, svd_solver):
    n_samples, n_features = X.shape

    if isinstance(n_components, str):
        raise ValueError(
            "n_components=%r cannot be a string with svd_solver='%s'"
            % (n_components, svd_solver)
        )
    elif not 1 <= n_components <= min(n_samples, n_features):
        raise ValueError(
            "n_components=%r must be between 1 and "
            "min(n_samples, n_features)=%r with "
            "svd_solver='%s'"
            % (n_components, min(n_samples, n_features), svd_solver)
        )
    elif not isinstance(n_components, numbers.Integral):
        raise ValueError(
            "n_components=%r must be of type int "
            "when greater than or equal to 1, was of type=%r"
            % (n_components, type(n_components))
        )
    elif svd_solver == "arpack" and n_components == min(n_samples, n_features):
        raise ValueError(
            "n_components=%r must be strictly less than "
            "min(n_samples, n_features)=%r with "
            "svd_solver='%s'"
            % (n_components, min(n_samples, n_features), svd_solver)
        )

    random_state = check_random_state(self.random_state)

    # Center data
    self.mean_ = np.mean(X, axis=0)
    X -= self.mean_

    if svd_solver == "arpack":
        v0 = _init_arpack_v0(min(X.shape), random_state)
        U, S, Vt = svds(X, k=n_components, tol=self.tol, v0=v0)
        # svds doesn't abide by scipy.linalg.svd/randomized_svd
        # conventions, so reverse its outputs.
        S = S[::-1]
        # flip eigenvectors' sign to enforce deterministic output
        U, Vt = svd_flip(U[:, ::-1], Vt[::-1])

    elif svd_solver == "randomized":
        # sign flipping is done inside
        U, S, Vt = randomized_svd(
            X,
            n_components=n_components,
            n_oversamples=self.n_oversamples,
            n_iter=self.iterated_power,
            flip_sign=True,
            random_state=random_state,
        )

    self.n_samples_, self.n_features_ = n_samples, n_features
    self.components_ = Vt
    self.n_components_ = n_components

    # Get variance explained by singular values
    self.explained_variance_ = (S ** 2) / (n_samples - 1)
    total_var = np.var(X, ddof=1, axis=0)
    self.explained_variance_ratio_ = self.explained_variance_ / total_var.sum()
    self.singular_values_ = S.copy()  # Store the singular values.

    if self.n_components_ < min(n_features, n_samples):
        self.noise_variance_ = total_var.sum() - self.explained_variance_.sum()
        self.noise_variance_ /= min(n_features, n_samples) - n_components
    else:
        self.noise_variance_ = 0.0

    return U, S, Vt
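The explained-variance bookkeeping at the end can be sanity-checked with plain numpy: for a full SVD of centered data, the per-component variances S**2 / (n - 1) sum to the total variance, so the ratios sum to 1 (the data here is an arbitrary assumption).

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(1000, 6)
X = X - X.mean(axis=0)
U, S, Vt = np.linalg.svd(X, full_matrices=False)
explained_variance = S**2 / (X.shape[0] - 1)
total_var = np.var(X, ddof=1, axis=0).sum()
print(np.isclose(explained_variance.sum(), total_var))  # True for the full spectrum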
randomized_svd computes an approximate truncated singular value decomposition of the matrix passed to it.
In the function definition, n_components is the number of singular values and vectors to extract.
[Sources]
https://scikit-learn.org/stable/modules/generated/sklearn.utils.extmath.randomized_svd.html
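A small usage sketch of randomized_svd itself, mirroring the call made by _fit_truncated (the data and shapes are assumptions):

import numpy as np
from sklearn.utils.extmath import randomized_svd

rng = np.random.RandomState(0)
X = rng.rand(1000, 6)
X = X - X.mean(axis=0)  # _fit_truncated subtracts self.mean_ before this call
U, S, Vt = randomized_svd(X, n_components=2, n_oversamples=10, random_state=0)
print(U.shape, S.shape, Vt.shape)  # (1000, 2) (2,) (2, 6)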
def randomized_svd(
    M,
    n_components,
    *,
    n_oversamples=10,
    n_iter="auto",
    power_iteration_normalizer="auto",
    transpose="auto",
    flip_sign=True,
    random_state="warn",
):
    if isinstance(M, (sparse.lil_matrix, sparse.dok_matrix)):
        warnings.warn(
            "Calculating SVD of a {} is expensive. "
            "csr_matrix is more efficient.".format(type(M).__name__),
            sparse.SparseEfficiencyWarning,
        )

    if random_state == "warn":
        warnings.warn(
            "If 'random_state' is not supplied, the current default "
            "is to use 0 as a fixed seed. This will change to "
            "None in version 1.2 leading to non-deterministic results "
            "that better reflect nature of the randomized_svd solver. "
            "If you want to silence this warning, set 'random_state' "
            "to an integer seed or to None explicitly depending "
            "if you want your code to be deterministic or not.",
            FutureWarning,
        )
        random_state = 0

    random_state = check_random_state(random_state)
    n_random = n_components + n_oversamples
    n_samples, n_features = M.shape

    if n_iter == "auto":
        # Checks if the number of iterations is explicitly specified
        # Adjust n_iter. 7 was found a good compromise for PCA. See #5299
        n_iter = 7 if n_components < 0.1 * min(M.shape) else 4

    if transpose == "auto":
        transpose = n_samples < n_features
    if transpose:
        # this implementation is a bit faster with smaller shape[1]
        M = M.T

    Q = randomized_range_finder(
        M,
        size=n_random,
        n_iter=n_iter,
        power_iteration_normalizer=power_iteration_normalizer,
        random_state=random_state,
    )

    # project M to the (k + p) dimensional space using the basis vectors
    B = safe_sparse_dot(Q.T, M)

    # compute the SVD on the thin matrix: (k + p) wide
    Uhat, s, Vt = linalg.svd(B, full_matrices=False)

    del B
    U = np.dot(Q, Uhat)

    if flip_sign:
        if not transpose:
            U, Vt = svd_flip(U, Vt)
        else:
            # In case of transpose u_based_decision=false
            # to actually flip based on u and not v.
            U, Vt = svd_flip(U, Vt, u_based_decision=False)

    if transpose:
        # transpose back the results according to the input convention
        return Vt[:n_components, :].T, s[:n_components], U[:, :n_components].T
    else:
        return U[:, :n_components], s[:n_components], Vt[:n_components, :]
randomized_range_finder: Computes an orthonormal matrix whose range approximates the range of A.
safe_sparse_dot: Dot product that handles the sparse matrix case correctly.
linalg.svd: When a is a 2D array, it is factorized as u @ np.diag(s) @ vh = (u * s) @ vh, where u and vh are 2D unitary arrays and s is a 1D array of a's singular values. When a is higher-dimensional, SVD is applied in stacked mode.
svd_flip: Sign correction to ensure deterministic output from SVD.
Adjusts the columns of u and the rows of v such that the loadings in the columns in u that are largest in absolute value are always positive.
[Sources]
https://scikit-learn.org/stable/modules/generated/sklearn.utils.extmath.randomized_range_finder.html
https://numpy.org/doc/stable/reference/generated/numpy.linalg.svd.html
https://www.kite.com/python/docs/sklearn.utils.extmath.svd_flip
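Small sketches of two of the helpers described above (shapes and values are assumptions):

import numpy as np
from sklearn.utils.extmath import randomized_range_finder, svd_flip

rng = np.random.RandomState(0)
A = rng.rand(1000, 6)

# randomized_range_finder returns a matrix with orthonormal columns
Q = randomized_range_finder(A, size=6, n_iter=4, random_state=0)
print(np.allclose(Q.T @ Q, np.eye(Q.shape[1])))  # True

# svd_flip fixes the sign ambiguity of SVD factors without changing the product
u = np.array([[-2.0, 1.0], [-1.0, -3.0]])
v = np.array([[1.0, 2.0], [3.0, 4.0]])
u2, v2 = svd_flip(u.copy(), v.copy())
print(np.allclose(u2 @ v2, u @ v))  # True: flipped columns of u pair with flipped rows of v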
def fit_transform(self, X, y=None):
    U, S, Vt = self._fit(X)
    U = U[:, : self.n_components_]

    if self.whiten:
        # X_new = X * V / S * sqrt(n_samples) = U * sqrt(n_samples)
        U *= sqrt(X.shape[0] - 1)
    else:
        # X_new = X * V = U * S * Vt * V = U * S
        U *= S[: self.n_components_]

    return U
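With whiten=True, each returned component ends up with unit sample variance, since the columns of U have unit norm and zero mean; a quick check through the public API (the data is an assumption):

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.rand(1000, 6)
Z = PCA(n_components=2, whiten=True, random_state=0).fit_transform(X)
print(np.var(Z, axis=0, ddof=1))  # approximately [1. 1.]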