AI > Machine Learning

from sklearn.decomposition import PCA

' X is dataset '

pca = PCA(n_components = 2).fit(X)

X_pca = pca.transform(X)

이 두줄 의미 파악하기

========================================================

X = (960,6)이라고 가정하고, 아래의 코드를 참조했을 때 PCA(n_components = 2).fit(X)라는 한 줄은

randomized_svd(X, n_components=2, *, n_oversamples=10, n_iter='auto', power_iteration_normalizer='auto', transpose='auto', flip_sign=True, random_state='warn')를 따른다고 할 수 있다.

이 함수 내의 계산을 보면

Q = randomized_range_finder(X,size = n_random, n_iter = n_iter, power_iteration_nomalizer = power_iteration_normalizer, random_state = random_state)

## Q = X의 직교 행렬

B = safe_sparse_dot(Q.T, X)

## B = X의 직교행렬의 transpose * X

Uhat, s, Vt = linalg.svd(B, full_matrices=False)

######

Uhat : Unitary array(s). The first a.ndim - 2 dimensions have the same size as those of the input B

s : Vector(s) with the singular values, within each vector sorted in descending order. The first a.ndim - 2 dimensions have the same size as those of the input B

Vt : Unitary array(s). The first a.ndim - 2 dimensions have the same size as those of the input B

B = (Uhat * s) @ Vt

#######

U = np.dot(Q, Uhat)

## U = Q * Uhat

U, Vt = svd_flip(U, Vt)

#######

svd_flip : Sign correction to ensure deterministic output from SVD.

Adjusts the columns of u and the rows of v such that the loadings in the columns in u that are largest in absolute value are always positive.

#######

return U[:, :n_components], s[:n_components], Vt[:n_components, :]

fit_transform(X)

U, S, Vt = self._fit(X)

U = U[:, : self.n_components_]

U *= S[: self.n_components_]

return U

[출처]

https://scikit-learn.org/stable/modules/generated/sklearn.utils.extmath.randomized_svd.html

https://scikit-learn.org/stable/modules/generated/sklearn.utils.extmath.randomized_range_finder.html

https://numpy.org/doc/stable/reference/generated/numpy.linalg.svd.html

https://www.kite.com/python/docs/sklearn.utils.extmath.svd_flip

===============================================================================

def fit(self,X,y=None):

class PCA(_BasePCA):

def __init__(
	self,
	n_components=None,
	*,
	copy=True,
	whiten=False,
	svd_solver="auto",
	tol=0.0,
	iterated_power="auto",
	n_oversamples=10,
	random_state=None,

PCA(n_components = 2).fit(X) 중

self.n_components=2 로 객체 생성

def fit(self, X, y=None):

	check_scalar(
	self.n_oversamples,
	"n_oversamples",
	min_val=1,
	target_type=numbers.Integral,
	)

	self._fit(X)
	return self

sklearn.utils.check_scalar(x, name, target_type, *, min_val=None, max_val=None):스칼라 매개 변수 유형 및 값의 유효성을 검사합니다.

n_oversamples의 파라미터를 지정 안했으니, self.n_oversamples=10이고 oversamples가 str값으로 n_oversamples이고 최솟값 1을 가지며 데이터 타입이 numbers.Integral이 아니면 에러 발생시킴

fit을 하니 _fit로 넘어감

def _fit(self, X):
	"""Dispatch to the right submethod depending on the chosen solver."""

	# Raise an error for sparse input.
	# This is more informative than the generic one raised by check_array.
	if issparse(X):
	raise TypeError(
	"PCA does not support sparse input. See "
	"TruncatedSVD for a possible alternative."
	)

	X = self._validate_data(
	X, dtype=[np.float64, np.float32], ensure_2d=True, copy=self.copy
	)

	# Handle n_components==None
	if self.n_components is None:
	if self.svd_solver != "arpack":
	n_components = min(X.shape)
	else:
	n_components = min(X.shape) - 1
	else:
	n_components = self.n_components

	# Handle svd_solver
	self._fit_svd_solver = self.svd_solver
	if self._fit_svd_solver == "auto":
	# Small problem or n_components == 'mle', just call full PCA
	if max(X.shape) <= 500 or n_components == "mle":
	self._fit_svd_solver = "full"
	elif n_components >= 1 and n_components < 0.8 * min(X.shape):
	self._fit_svd_solver = "randomized"
	# This is also the case of n_components in (0,1)
	else:
	self._fit_svd_solver = "full"

	# Call different fits for either full or truncated SVD
	if self._fit_svd_solver == "full":
	return self._fit_full(X, n_components)
	elif self._fit_svd_solver in ["arpack", "randomized"]:
	return self._fit_truncated(X, n_components, self._fit_svd_solver)
	else:
	raise ValueError(
	"Unrecognized svd_solver='{0}'".format(self._fit_svd_solver)
	)

svd는 특이값 분해. 고윳값 분해는 정방행렬만 다뤘다면 특이값 분해는 "정방"이라는 제약조건에서 일반화시킨거라 생각하면 된다.

0.8 * min(X.shape) = 4.8이고 self.n_components = 2이므로 elif n_components >= 1 and n_components < 0.8 * min(X.shape):를 따른다.

따라서 self._fit_svd_solver = "randomized"

self._fit_truncated(X, n_components, self._fit_svd_solver)를 조사하면 된다.

[출처] 특이값 분해 : https://darkpgmr.tistory.com/106

def _fit_truncated(self, X, n_components, svd_solver):



	n_samples, n_features = X.shape

	if isinstance(n_components, str):
	raise ValueError(
	"n_components=%r cannot be a string with svd_solver='%s'"
	% (n_components, svd_solver)
	)
	elif not 1 <= n_components <= min(n_samples, n_features):
	raise ValueError(
	"n_components=%r must be between 1 and "
	"min(n_samples, n_features)=%r with "
	"svd_solver='%s'"
	% (n_components, min(n_samples, n_features), svd_solver)
	)
	elif not isinstance(n_components, numbers.Integral):
	raise ValueError(
	"n_components=%r must be of type int "
	"when greater than or equal to 1, was of type=%r"
	% (n_components, type(n_components))
	)
	elif svd_solver == "arpack" and n_components == min(n_samples, n_features):
	raise ValueError(
	"n_components=%r must be strictly less than "
	"min(n_samples, n_features)=%r with "
	"svd_solver='%s'"
	% (n_components, min(n_samples, n_features), svd_solver)
	)

	random_state = check_random_state(self.random_state)

	# Center data
	self.mean_ = np.mean(X, axis=0)
	X -= self.mean_

	if svd_solver == "arpack":
	v0 = _init_arpack_v0(min(X.shape), random_state)
	U, S, Vt = svds(X, k=n_components, tol=self.tol, v0=v0)
	# svds doesn't abide by scipy.linalg.svd/randomized_svd
	# conventions, so reverse its outputs.
	S = S[::-1]
	# flip eigenvectors' sign to enforce deterministic output
	U, Vt = svd_flip(U[:, ::-1], Vt[::-1])

	elif svd_solver == "randomized":
	# sign flipping is done inside
	U, S, Vt = randomized_svd(
	X,
	n_components=n_components,
	n_oversamples=self.n_oversamples,
	n_iter=self.iterated_power,
	flip_sign=True,
	random_state=random_state,
	)

	self.n_samples_, self.n_features_ = n_samples, n_features
	self.components_ = Vt
	self.n_components_ = n_components

	# Get variance explained by singular values
	self.explained_variance_ = (S ** 2) / (n_samples - 1)
	total_var = np.var(X, ddof=1, axis=0)
	self.explained_variance_ratio_ = self.explained_variance_ / total_var.sum()
	self.singular_values_ = S.copy() # Store the singular values.

	if self.n_components_ < min(n_features, n_samples):
	self.noise_variance_ = total_var.sum() - self.explained_variance_.sum()
	self.noise_variance_ /= min(n_features, n_samples) - n_components
	else:
	self.noise_variance_ = 0.0

	return U, S, Vt

randomized_svd는 파라미터 값을 넣으면 특이값 분해 를 계산해준다.

함수 정의에서 n_components 는 추출될 값이나 벡터 개수

[출처]

https://scikit-learn.org/stable/modules/generated/sklearn.utils.extmath.randomized_svd.html

def randomized_svd(
	M,
	n_components,
	*,
	n_oversamples=10,
	n_iter="auto",
	power_iteration_normalizer="auto",
	transpose="auto",
	flip_sign=True,
	random_state="warn",
	):



	if isinstance(M, (sparse.lil_matrix, sparse.dok_matrix)):
	warnings.warn(
	"Calculating SVD of a {} is expensive. "
	"csr_matrix is more efficient.".format(type(M).__name__),
	sparse.SparseEfficiencyWarning,
	)

	if random_state == "warn":
	warnings.warn(
	"If 'random_state' is not supplied, the current default "
	"is to use 0 as a fixed seed. This will change to "
	"None in version 1.2 leading to non-deterministic results "
	"that better reflect nature of the randomized_svd solver. "
	"If you want to silence this warning, set 'random_state' "
	"to an integer seed or to None explicitly depending "
	"if you want your code to be deterministic or not.",
	FutureWarning,
	)
	random_state = 0

	random_state = check_random_state(random_state)
	n_random = n_components + n_oversamples
	n_samples, n_features = M.shape

	if n_iter == "auto":
	# Checks if the number of iterations is explicitly specified
	# Adjust n_iter. 7 was found a good compromise for PCA. See #5299
	n_iter = 7 if n_components < 0.1 * min(M.shape) else 4

	if transpose == "auto":
	transpose = n_samples < n_features
	if transpose:
	# this implementation is a bit faster with smaller shape[1]
	M = M.T

	Q = randomized_range_finder(
	M,
	size=n_random,
	n_iter=n_iter,
	power_iteration_normalizer=power_iteration_normalizer,
	random_state=random_state,
	)

	# project M to the (k + p) dimensional space using the basis vectors
	B = safe_sparse_dot(Q.T, M)

	# compute the SVD on the thin matrix: (k + p) wide
	Uhat, s, Vt = linalg.svd(B, full_matrices=False)

	del B
	U = np.dot(Q, Uhat)

	if flip_sign:
	if not transpose:
	U, Vt = svd_flip(U, Vt)
	else:
	# In case of transpose u_based_decision=false
	# to actually flip based on u and not v.
	U, Vt = svd_flip(U, Vt, u_based_decision=False)

	if transpose:
	# transpose back the results according to the input convention
	return Vt[:n_components, :].T, s[:n_components], U[:, :n_components].T
	else:
	return U[:, :n_components], s[:n_components], Vt[:n_components, :]

randomized_range_finder :Computes an orthonormal matrix whose range approximates the range of A

safe_sparse_dot : Dot product that handle the sparse matrix case correctly.

linalg.svd: When a is a 2D array, it is factorized as u @ np.diag(s) @ vh = (u * s) @ vh, where u and vhare 2D unitary arrays and s is a 1D array of a’s singular values. When a is higher-dimensional, SVD is applied in stacked mode as explained below.

svd_flip : Sign correction to ensure deterministic output from SVD.

Adjusts the columns of u and the rows of v such that the loadings in the columns in u that are largest in absolute value are always positive.

https://scikit-learn.org/stable/modules/generated/sklearn.utils.extmath.randomized_range_finder.html

https://numpy.org/doc/stable/reference/generated/numpy.linalg.svd.html

https://www.kite.com/python/docs/sklearn.utils.extmath.svd_flip

def fit_transform(self, X, y=None):
	U, S, Vt = self._fit(X)
	U = U[:, : self.n_components_]
	if self.whiten:
	# X_new = X * V / S * sqrt(n_samples) = U * sqrt(n_samples)
	U *= sqrt(X.shape[0] - 1)
	else:
	# X_new = X * V = U * S * Vt * V = U * S
	U *= S[: self.n_components_]

	return U

Cloud Eye

AI > Machine Learning < PCA

티스토리툴바