본문 바로가기

카테고리 없음

AI > Machine Learning < PCA

from sklearn.decomposition import PCA

' X is dataset '
pca = PCA(n_components = 2).fit(X)
X_pca = pca.transform(X)
 
이 두줄 의미 파악하기
 
 
========================================================
 
X = (960,6)이라고 가정하고, 아래의 코드를 참조했을 때 PCA(n_components = 2).fit(X)라는 한 줄은
 
 
randomized_svd(Xn_components=2*n_oversamples=10n_iter='auto'power_iteration_normalizer='auto'transpose='auto'flip_sign=Truerandom_state='warn')를 따른다고 할 수 있다.

이 함수 내의 계산을 보면

 

Q = randomized_range_finder(X,size = n_random, n_iter = n_iter, power_iteration_nomalizer = power_iteration_normalizer, random_state = random_state)

 

## Q = X의 직교 행렬

 

B = safe_sparse_dot(Q.T, X)

 

## B = X의 직교행렬의 transpose * X

 

Uhat, s, Vt = linalg.svd(B, full_matrices=False)

 

######

Uhat : Unitary array(s). The first a.ndim - 2 dimensions have the same size as those of the input B

s : Vector(s) with the singular values, within each vector sorted in descending order. The first a.ndim - 2 dimensions  have the same size as those of the input B

Vt : Unitary array(s). The first a.ndim - 2 dimensions have the same size as those of the input B

B = (Uhat * s) @ Vt

#######

 

 

U = np.dot(QUhat)

 

## U = Q * Uhat

 

UVt = svd_flip(UVt)

 

 

#######

svd_flip : Sign correction to ensure deterministic output from SVD.

Adjusts the columns of u and the rows of v such that the loadings in the columns in u that are largest in absolute value are always positive.

#######

 

 

return U[:, :n_components], s[:n_components], Vt[:n_components, :]

 

 

fit_transform(X)

U, S, Vt = self._fit(X)

 

U = U[:, : self.n_components_]

 

U *= S[: self.n_components_]

 

return U

 

 

 

 

 

[출처]

https://scikit-learn.org/stable/modules/generated/sklearn.utils.extmath.randomized_svd.html

https://scikit-learn.org/stable/modules/generated/sklearn.utils.extmath.randomized_range_finder.html

https://numpy.org/doc/stable/reference/generated/numpy.linalg.svd.html

https://www.kite.com/python/docs/sklearn.utils.extmath.svd_flip

 

 

===============================================================================

 

def fit(self,X,y=None):

 

 

class PCA(_BasePCA):

 

def __init__(
  self,
  n_components=None,
  *,
  copy=True,
  whiten=False,
  svd_solver="auto",
  tol=0.0,
  iterated_power="auto",
  n_oversamples=10,
  random_state=None,

PCA(n_components = 2).fit(X) 중

self.n_components=2 로 객체 생성

 

 

def fit(self, X, y=None):
   
  check_scalar(
  self.n_oversamples,
  "n_oversamples",
  min_val=1,
  target_type=numbers.Integral,
  )
   
  self._fit(X)
  return self

sklearn.utils.check_scalar(x, name, target_type, *, min_val=None, max_val=None):스칼라 매개 변수 유형 및 값의 유효성을 검사합니다.

n_oversamples의 파라미터를 지정 안했으니, self.n_oversamples=10이고 oversamples가 str값으로 n_oversamples이고 최솟값 1을 가지며 데이터 타입이 numbers.Integral이 아니면 에러 발생시킴

 

fit을 하니 _fit로 넘어감

 

def _fit(self, X):
  """Dispatch to the right submethod depending on the chosen solver."""
   
  # Raise an error for sparse input.
  # This is more informative than the generic one raised by check_array.
  if issparse(X):
  raise TypeError(
  "PCA does not support sparse input. See "
  "TruncatedSVD for a possible alternative."
  )
   
  X = self._validate_data(
  X, dtype=[np.float64, np.float32], ensure_2d=True, copy=self.copy
  )
   
  # Handle n_components==None
  if self.n_components is None:
  if self.svd_solver != "arpack":
  n_components = min(X.shape)
  else:
  n_components = min(X.shape) - 1
  else:
  n_components = self.n_components
   
  # Handle svd_solver
  self._fit_svd_solver = self.svd_solver
  if self._fit_svd_solver == "auto":
  # Small problem or n_components == 'mle', just call full PCA
  if max(X.shape) <= 500 or n_components == "mle":
  self._fit_svd_solver = "full"
  elif n_components >= 1 and n_components < 0.8 * min(X.shape):
  self._fit_svd_solver = "randomized"
  # This is also the case of n_components in (0,1)
  else:
  self._fit_svd_solver = "full"
   
  # Call different fits for either full or truncated SVD
  if self._fit_svd_solver == "full":
  return self._fit_full(X, n_components)
  elif self._fit_svd_solver in ["arpack", "randomized"]:
  return self._fit_truncated(X, n_components, self._fit_svd_solver)
  else:
  raise ValueError(
  "Unrecognized svd_solver='{0}'".format(self._fit_svd_solver)
  )

svd는 특이값 분해. 고윳값 분해는 정방행렬만 다뤘다면 특이값 분해는 "정방"이라는 제약조건에서 일반화시킨거라 생각하면 된다.

 

 0.8 * min(X.shape) = 4.8이고 self.n_components = 2이므로 elif n_components >= 1 and n_components < 0.8 * min(X.shape):를 따른다.

따라서 self._fit_svd_solver = "randomized"

self._fit_truncated(X, n_components, self._fit_svd_solver)를 조사하면 된다.

 

[출처] 특이값 분해 : https://darkpgmr.tistory.com/106

def _fit_truncated(self, X, n_components, svd_solver):
   
   
   
  n_samples, n_features = X.shape
   
  if isinstance(n_components, str):
  raise ValueError(
  "n_components=%r cannot be a string with svd_solver='%s'"
  % (n_components, svd_solver)
  )
  elif not 1 <= n_components <= min(n_samples, n_features):
  raise ValueError(
  "n_components=%r must be between 1 and "
  "min(n_samples, n_features)=%r with "
  "svd_solver='%s'"
  % (n_components, min(n_samples, n_features), svd_solver)
  )
  elif not isinstance(n_components, numbers.Integral):
  raise ValueError(
  "n_components=%r must be of type int "
  "when greater than or equal to 1, was of type=%r"
  % (n_components, type(n_components))
  )
  elif svd_solver == "arpack" and n_components == min(n_samples, n_features):
  raise ValueError(
  "n_components=%r must be strictly less than "
  "min(n_samples, n_features)=%r with "
  "svd_solver='%s'"
  % (n_components, min(n_samples, n_features), svd_solver)
  )
   
  random_state = check_random_state(self.random_state)
   
  # Center data
  self.mean_ = np.mean(X, axis=0)
  X -= self.mean_
   
  if svd_solver == "arpack":
  v0 = _init_arpack_v0(min(X.shape), random_state)
  U, S, Vt = svds(X, k=n_components, tol=self.tol, v0=v0)
  # svds doesn't abide by scipy.linalg.svd/randomized_svd
  # conventions, so reverse its outputs.
  S = S[::-1]
  # flip eigenvectors' sign to enforce deterministic output
  U, Vt = svd_flip(U[:, ::-1], Vt[::-1])
   
  elif svd_solver == "randomized":
  # sign flipping is done inside
  U, S, Vt = randomized_svd(
  X,
  n_components=n_components,
  n_oversamples=self.n_oversamples,
  n_iter=self.iterated_power,
  flip_sign=True,
  random_state=random_state,
  )
   
  self.n_samples_, self.n_features_ = n_samples, n_features
  self.components_ = Vt
  self.n_components_ = n_components
   
  # Get variance explained by singular values
  self.explained_variance_ = (S ** 2) / (n_samples - 1)
  total_var = np.var(X, ddof=1, axis=0)
  self.explained_variance_ratio_ = self.explained_variance_ / total_var.sum()
  self.singular_values_ = S.copy()  # Store the singular values.
   
  if self.n_components_ < min(n_features, n_samples):
  self.noise_variance_ = total_var.sum() - self.explained_variance_.sum()
  self.noise_variance_ /= min(n_features, n_samples) - n_components
  else:
  self.noise_variance_ = 0.0
   
  return U, S, Vt

randomized_svd는 파라미터 값을 넣으면 특이값 분해 를 계산해준다.

함수 정의에서 n_components 는 추출될 값이나 벡터 개수

 

 

 

 

 

 

[출처]

https://scikit-learn.org/stable/modules/generated/sklearn.utils.extmath.randomized_svd.html

 

 

 

 

 

 

def randomized_svd(
  M,
  n_components,
  *,
  n_oversamples=10,
  n_iter="auto",
  power_iteration_normalizer="auto",
  transpose="auto",
  flip_sign=True,
  random_state="warn",
  ):
   
   
   
  if isinstance(M, (sparse.lil_matrix, sparse.dok_matrix)):
  warnings.warn(
  "Calculating SVD of a {} is expensive. "
  "csr_matrix is more efficient.".format(type(M).__name__),
  sparse.SparseEfficiencyWarning,
  )
   
  if random_state == "warn":
  warnings.warn(
  "If 'random_state' is not supplied, the current default "
  "is to use 0 as a fixed seed. This will change to  "
  "None in version 1.2 leading to non-deterministic results "
  "that better reflect nature of the randomized_svd solver. "
  "If you want to silence this warning, set 'random_state' "
  "to an integer seed or to None explicitly depending "
  "if you want your code to be deterministic or not.",
  FutureWarning,
  )
  random_state = 0
   
  random_state = check_random_state(random_state)
  n_random = n_components + n_oversamples
  n_samples, n_features = M.shape
   
  if n_iter == "auto":
  # Checks if the number of iterations is explicitly specified
  # Adjust n_iter. 7 was found a good compromise for PCA. See #5299
  n_iter = 7 if n_components < 0.1 * min(M.shape) else 4
   
  if transpose == "auto":
  transpose = n_samples < n_features
  if transpose:
  # this implementation is a bit faster with smaller shape[1]
  M = M.T
   
  Q = randomized_range_finder(
  M,
  size=n_random,
  n_iter=n_iter,
  power_iteration_normalizer=power_iteration_normalizer,
  random_state=random_state,
  )
   
  # project M to the (k + p) dimensional space using the basis vectors
  B = safe_sparse_dot(Q.T, M)
   
  # compute the SVD on the thin matrix: (k + p) wide
  Uhat, s, Vt = linalg.svd(B, full_matrices=False)
   
  del B
  U = np.dot(Q, Uhat)
   
  if flip_sign:
  if not transpose:
  U, Vt = svd_flip(U, Vt)
  else:
  # In case of transpose u_based_decision=false
  # to actually flip based on u and not v.
  U, Vt = svd_flip(U, Vt, u_based_decision=False)
   
  if transpose:
  # transpose back the results according to the input convention
  return Vt[:n_components, :].T, s[:n_components], U[:, :n_components].T
  else:
  return U[:, :n_components], s[:n_components], Vt[:n_components, :]
 

 

randomized_range_finder :Computes an orthonormal matrix whose range approximates the range of A

safe_sparse_dot : Dot product that handle the sparse matrix case correctly.

linalg.svd: When a is a 2D array, it is factorized as u @ np.diag(s) @ vh = (u * s) @ vh, where u and vhare 2D unitary arrays and s is a 1D array of a’s singular values. When a is higher-dimensional, SVD is applied in stacked mode as explained below.

svd_flip : Sign correction to ensure deterministic output from SVD.

Adjusts the columns of u and the rows of v such that the loadings in the columns in u that are largest in absolute value are always positive.

 

 

 

https://scikit-learn.org/stable/modules/generated/sklearn.utils.extmath.randomized_range_finder.html

https://numpy.org/doc/stable/reference/generated/numpy.linalg.svd.html

https://www.kite.com/python/docs/sklearn.utils.extmath.svd_flip

 

 

 

 

 

def fit_transform(self, X, y=None):

U, S, Vt = self._fit(X)

U = U[:, : self.n_components_]

if self.whiten:

# X_new = X * V / S * sqrt(n_samples) = U * sqrt(n_samples)

U *= sqrt(X.shape[0] - 1)

else:

# X_new = X * V = U * S * Vt * V = U * S

U *= S[: self.n_components_]



return U