import warnings
warnings.filterwarnings('ignore')
from sklearn import datasets

# Load the handwritten-digits dataset bundled with scikit-learn.
digits = datasets.load_digits()
# The loader returns a sklearn.utils._bunch.Bunch: a class used inside
# scikit-learn to group several attributes together, much like a dictionary,
# except that entries can be read with dot access (digits.target) instead of
# square brackets (digits['target']).
# Print, one per line: the container type, its keys, and the label vector.
print(type(digits), digits.keys(), digits.target, sep='\n')

<class 'sklearn.utils._bunch.Bunch'>
dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'images', 'DESCR'])
[0 1 2 ... 8 9 8]

import matplotlib.pyplot as plt
import numpy as np

# Report the shape of one flattened sample from digits.data.
first_sample = digits.data[0]
print(first_sample.shape)
# Draw the first digit as an 8x8 picture using the reversed gray colormap.
first_image = np.reshape(digits.images[0], (8, 8))
plt.imshow(first_image, cmap=plt.cm.gray_r)

(64,)

<matplotlib.image.AxesImage at 0x291aaad1050>

from sklearn import svm

# RBF (radial basis function) kernel: K(x, y) = exp(-gamma * ||x - y||^2).
# gamma is the RBF kernel parameter; C is the regularization parameter.
clf = svm.SVC(kernel='rbf', gamma=0.001, C=100.)
# Train on every sample except the first one.
train_features, train_labels = digits.data[1:], digits.target[1:]
clf.fit(train_features, train_labels)

SVC(C=100.0, gamma=0.001)

SVC(C=100.0, gamma=0.001)

# Classify the first sample; the [:1] slice keeps the input two-dimensional.
clf.predict(digits.data[:1])

array([0])

# Generate synthetic data:
# 200 points in a 500-dimensional space, with sparsity k = 10.
n_sample, dim, k = 200, 500, 10
X = np.random.randn(n_sample, dim)
# Only k randomly chosen entries of the true coefficient vector are non-zero.
beta = np.zeros(dim)
inds = np.random.choice(np.arange(dim), k, replace=False)
beta[inds] = 5 * np.random.randn(k)
# Linear response plus a small amount of Gaussian noise.
y = X.dot(beta) + 0.1 * np.random.randn(n_sample)

# Split into train/test sets: first half for training, second half for testing.
half = n_sample // 2
X_train, X_test = X[:half], X[half:]
y_train, y_test = y[:half], y[half:]

# Fit the model on the training set.
from sklearn.linear_model import Lasso
# The alpha parameter controls how much regularization we apply.
lasso = Lasso(alpha=0.1)
# Once fit() has been called, the lasso coefficients have been updated to
# fit the training data.
lasso.fit(X_train, y_train)

# Make predictions on the test set.
y_pred_lasso = lasso.predict(X_test)
from sklearn.metrics import r2_score
# Evaluate how well the model fits the test data.
# r2_score is R-squared; the closer to 1, the better.
# R^2 = 1 - sum((y_test - y_pred) ** 2) / sum((y_test - y_test.mean()) ** 2)
r2_score_lasso = r2_score(y_test, y_pred_lasso)
r2_score_lasso

0.9990104863110415

# Indices of the non-zero entries of the true coefficient vector.
np.nonzero(beta)

(array([  0,   3,  35,  49, 175, 190, 198, 331, 366, 391], dtype=int64),)

# Input features that influence the target variable according to the fitted
# model, i.e. the indices whose Lasso coefficient is non-zero.
np.nonzero(lasso.coef_)

(array([  0,   3,  35,  49,  98, 143, 175, 190, 198, 275, 316, 331, 366,
        391], dtype=int64),)

from sklearn.metrics import  f1_score
# F1 = 2 * (precision * recall) / (precision + recall)
# Score how well the Lasso support (non-zero coefficients) recovers the true
# support of beta. f1_score takes (y_true, y_pred), so the ground-truth mask
# goes first and the estimated mask second.
f1_score(beta != 0, lasso.coef_ != 0, average='binary')

0.8333333333333334

import pandas as pd
import seaborn as sns
# The famous iris dataset. The data clearly contains cluster structure.
# How can we discover it without using the label information?
iris = datasets.load_iris()
x, y = iris.data, iris.target
# Turn the integer labels into species names:
# 0 -> setosa, 1 -> versicolor, anything else -> virginica.
species_by_label = {0: 'setosa', 1: 'versicolor'}
y_new = np.array([species_by_label.get(label, 'virginica') for label in iris.target])

sepal_ratio = x[:, 0] / x[:, 1]  # sepal length divided by sepal width
petal_ratio = x[:, 2] / x[:, 3]  # petal length divided by petal width
data = pd.DataFrame({'Sepal Ratio': sepal_ratio, 'Petal Ratio': petal_ratio, 'Class': y_new})

# Both joint plots share the same data and axes; only the plot kind differs.
joint_kwargs = dict(data=data, x="Sepal Ratio", y="Petal Ratio", hue="Class")
plt1 = sns.jointplot(**joint_kwargs)

plt2 = sns.jointplot(kind="kde", **joint_kwargs)

# Model the data as a mixture of Gaussians.
# Each Gaussian generates one cluster.
# For every cluster, estimate its mean and covariance.
colors = ['red', 'blue']
from sklearn import mixture
R = np.stack([sepal_ratio, petal_ratio], axis=1)
# Every sklearn model supports the fit method. Here, fitting means estimating
# the means and covariances of n=2 components.
# Usually we do not know the classes, so we do not know how to choose
# n_components. This is called model selection.
gmm = mixture.GaussianMixture(n_components=2,  covariance_type='full')
gmm.fit(R)

labels = gmm.predict(R)

# Scatter each predicted cluster in its own color.
for cluster_id in np.unique(labels):
    in_cluster = labels == cluster_id
    plt.scatter(sepal_ratio[in_cluster], petal_ratio[in_cluster],
                c=colors[cluster_id], label=cluster_id)

# gmm.means_ holds the mean of each Gaussian component.
print(gmm.means_)

[[2.1906731  3.02559636]
 [1.45993634 6.98743993]]

import warnings
warnings.filterwarnings('ignore')
from sklearn import mixture
iris = datasets.load_iris()
X, y = iris.data, iris.target
n_components_range = range(1, 7)
covar_types = ['spherical', 'tied', 'diag', 'full']
# One row per covariance type, one column per candidate number of components.
bics = np.zeros((len(covar_types), len(n_components_range)))
# For each number of clusters and each covariance estimation method,
# fit a GMM with that many clusters and that covariance method,
# measure the BIC of that choice, and store it in the bics array.
for i, cvtype in enumerate(covar_types):
    for j, n_comps in enumerate(n_components_range):
        gmm = mixture.GaussianMixture(n_components=n_comps, covariance_type=cvtype)
        gmm.fit(X)
        bics[i, j] = gmm.bic(X)

barwidth = 0.2
inds = np.array(list(n_components_range))
colors = ['blue', 'darkorange', 'teal', 'purple']
# Now look at the BIC scores. Within each covariance estimation method, the
# number of components with the lowest BIC is the one we should choose.
# Row i of bics holds the scores for covar_types[i]; each covariance type gets
# its own bar, shifted by one bar width from the previous one.
for i, (cvtype, color) in enumerate(zip(covar_types, colors)):
    plt.bar(inds + i * barwidth, bics[i], color=color, width=barwidth, label=cvtype)
# Bars are centered on their x positions, so the middle of each group lies
# halfway between the first and the last bar offset.
group_center = (len(covar_types) - 1) * barwidth / 2
plt.xticks(inds + group_center, n_components_range)
plt.title('BIC summary for the main covariance types')
plt.xlabel('Number of components')
plt.ylabel('BIC')
plt.legend()
plt.show()
# The full covariance has its lowest BIC at 2 clusters.
# The tied covariance selects the (correct) number of clusters, 3.
# The lesson here is not that one of these methods is always best,
# but that even a principled technique such as BIC can sometimes give us
# the wrong answer.

# Cross-validation (CV) is used to choose hyperparameters such as alpha in LASSO.
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score
import numpy as np
# Define the problem sizes first so this cell does not rely on values of
# dim and k left behind by an earlier cell.
(n_sample, dim, k) = (200, 500, 10)
# Sparse true coefficient vector: k randomly chosen non-zero entries.
beta = np.zeros(dim)
inds = np.random.choice(np.arange(dim), size=k, replace=False)
beta[inds] = np.random.randn(k)
X = np.random.randn(n_sample, dim)
# Linear response plus a small amount of Gaussian noise.
y = np.dot(X, beta) + 0.1*np.random.randn(n_sample)

lasso = Lasso(alpha=0.1)

# 5-fold cross-validation, scoring each fold with R^2.
scores = cross_val_score(lasso, X, y, cv=5, scoring='r2')
scores

array([0.98667861, 0.99112958, 0.98958778, 0.99305271, 0.98491647])

from sklearn.model_selection import cross_val_score
# Candidate regularization strengths to compare.
alphavals = np.array([0.1, 0.5, 1.0, 5, 10.0, 50, 100])
mean_scores = np.zeros(alphavals.shape)
std_scores = np.zeros(alphavals.shape)

# For every alpha, run 5-fold cross-validation and record the mean and the
# standard deviation of the fold scores.
for i, alpha in enumerate(alphavals):
    lasso = Lasso(alpha=alpha)
    scores = cross_val_score(lasso, X, y, cv=5)
    mean_scores[i] = scores.mean()
    std_scores[i] = scores.std()

# Mean score per alpha, with error bars of two standard deviations.
plt.errorbar(alphavals, mean_scores, yerr=2*std_scores, color='blue', linewidth=3, elinewidth=1)
plt.xscale('log')
plt.ylabel('r2 score')
# Set the x label once; an earlier plain 'alpha' label was always overwritten
# by this one.
plt.xlabel(r'Lasso regularization parameter $\alpha$')

Text(0.5, 0, 'Lasso regularization parameter $\\alpha$')

# Fit a three-component GMM on the two ratio features and plot each
# predicted cluster in its own color.
gmm = mixture.GaussianMixture(n_components=3, covariance_type='full')
gmm.fit(R)
labels = gmm.predict(R)
for i in np.unique(labels):
    in_cluster = labels == i
    plt.scatter(sepal_ratio[in_cluster], petal_ratio[in_cluster], c=colors[i])

from sklearn.metrics import adjusted_rand_score
# Compare the clustering with the true species labels.
# adjusted_rand_score takes (labels_true, labels_pred), so the reference
# labels go first.
adjusted_rand_score(iris.target, labels)

0.5312290473980091

# Repeat the clustering with two components and score it against the true
# species labels.
gmm = mixture.GaussianMixture(n_components=2, covariance_type='full')
gmm.fit(R)
labels = gmm.predict(R)
# adjusted_rand_score takes (labels_true, labels_pred), so the reference
# labels go first.
adjusted_rand_score(iris.target, labels)

0.5437515388376617

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
# Load the Breast Cancer dataset.
cancer = datasets.load_breast_cancer()
X, y = cancer.data, cancer.target

# Split the data into a training set and a test set (30% held out for testing).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Create the SVM model and train it.
model = SVC(probability=True)
model.fit(X_train, y_train)

# Predict: keep column 1 of the per-class probabilities as the score.
class_probabilities = model.predict_proba(X_test)
y_score = class_probabilities[:, 1]
y_score

array([4.87860948e-04, 9.78488947e-01, 9.59626283e-01, 2.67166094e-01,
       7.97237742e-01, 7.41158789e-01, 5.43943450e-01, 9.75144159e-01,
       9.78391588e-01, 9.52694852e-01, 9.68768695e-01, 9.10080457e-01,
       9.77606951e-01, 9.13912050e-01, 8.62345781e-01, 3.68365042e-03,
       8.32092650e-04, 4.59517857e-04, 9.65460833e-01, 9.43194413e-01,
       9.78614607e-01, 9.78842120e-01, 9.27500798e-01, 9.71162039e-01,
       9.59088171e-01, 6.02648946e-02, 5.14090863e-04, 2.66947599e-04,
       4.31717364e-04, 8.35050943e-01, 9.77600363e-01, 7.96942589e-01,
       9.79257396e-01, 9.76908881e-01, 9.80105669e-01, 4.15938144e-01,
       9.70688961e-01, 9.78013992e-01, 6.64463060e-01, 9.60504503e-01,
       9.72597595e-01, 9.61259094e-01, 9.64739977e-01, 9.78856888e-01,
       2.30831705e-03, 9.21467112e-04, 3.44830268e-04, 4.73094932e-01,
       9.34055492e-01, 1.89599188e-02, 9.24765178e-01, 9.58456720e-01,
       6.26795078e-04, 9.26873857e-01, 9.66862833e-01, 9.47751169e-01,
       1.53365757e-03, 9.74877325e-01, 2.97273074e-02, 8.88913504e-01,
       9.33950819e-04, 9.77679859e-01, 5.34191329e-04, 1.58302405e-02,
       9.74961542e-02, 8.48625882e-01, 9.76883862e-01, 1.50497449e-01,
       9.68974086e-01, 9.55371583e-01, 9.79243766e-01, 1.13934193e-03,
       2.90850573e-04, 9.55865854e-01, 1.79221327e-01, 4.63660916e-01,
       9.66972008e-01, 7.66197736e-03, 9.06877458e-01, 2.09062608e-03,
       8.07464480e-01, 3.57732536e-03, 9.75997570e-01, 9.18074311e-01,
       9.65161368e-01, 9.77395635e-01, 8.67599236e-01, 9.76801146e-01,
       3.33733328e-02, 8.82264435e-01, 8.69491718e-01, 9.81459031e-01,
       5.53680047e-04, 8.92383954e-01, 1.00930715e-02, 9.65015723e-01,
       1.02226695e-02, 9.72293227e-01, 8.45264751e-02, 3.18601325e-01,
       9.81162326e-01, 2.59343105e-04, 4.41423980e-04, 3.56687889e-04,
       6.42088923e-03, 8.61338047e-01, 1.40337682e-01, 5.55692232e-03,
       9.77049828e-01, 5.15856976e-01, 8.90724974e-01, 4.34362448e-03,
       9.80532921e-01, 5.21511985e-03, 2.71257521e-01, 6.66706168e-01,
       9.71581140e-01, 9.66320222e-01, 3.38768258e-03, 9.33725565e-01,
       9.63208860e-01, 9.28893467e-01, 9.29724192e-01, 8.25465418e-01,
       3.44446907e-04, 9.78594607e-01, 9.38190314e-01, 9.80321317e-01,
       9.54650169e-01, 6.87106559e-04, 9.57434771e-01, 3.36236273e-04,
       9.64577229e-01, 1.17577901e-03, 9.46363061e-01, 9.70761087e-01,
       9.77890147e-01, 2.58181664e-01, 2.06086655e-03, 9.71569006e-01,
       2.78974276e-01, 9.76697392e-01, 1.23851317e-02, 9.75562313e-01,
       9.17402577e-01, 9.81373067e-01, 9.81406973e-01, 7.25940874e-01,
       9.75190609e-01, 9.51245135e-01, 9.81871123e-01, 7.33074259e-01,
       9.59910807e-01, 9.62422403e-01, 9.80440269e-01, 8.60770874e-01,
       9.88235767e-04, 2.34496718e-03, 9.13600875e-01, 9.32743743e-01,
       8.47792676e-01, 2.24507690e-04, 9.76141495e-01, 6.02571556e-01,
       4.78874803e-01, 6.48427817e-03, 8.68571659e-01, 8.82155704e-01,
       8.60648470e-01, 9.69966127e-01, 9.67969217e-01])

# Compute the ROC curve points and the area under the curve from the scores.
fpr, tpr, thresholds = roc_curve(y_test, y_score)
auc_score = roc_auc_score(y_test, y_score)
roc_label = 'ROC curve (area = {:.2f})'.format(auc_score)

plt.figure()
plt.plot(fpr, tpr, color='blue', label=roc_label)
plt.plot([0, 1], [0, 1], color='red', linestyle='--')  # reference diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

from sklearn.metrics import precision_recall_curve, average_precision_score
# Precision/recall pairs over the score thresholds, plus the average precision.
precision, recall, _ = precision_recall_curve(y_test, y_score)
aupr = average_precision_score(y_test, y_score)
pr_label = 'Precision-Recall curve (AUPR = {:.2f})'.format(aupr)
# Draw the precision-recall curve.
plt.figure()
plt.plot(recall, precision, label=pr_label)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.show()

# Define the problem sizes first so this cell does not rely on values of
# dim and k left behind by an earlier cell.
(n_sample, dim, k) = (200, 500, 10)
# Sparse true coefficient vector: k randomly chosen non-zero entries.
beta = np.zeros(dim)
inds = np.random.choice(np.arange(dim), size=k, replace=False)
beta[inds] = 5*np.random.randn(k)
X = np.random.randn(n_sample, dim)
# Linear response plus a small amount of Gaussian noise.
y = np.dot(X, beta) + 0.1*np.random.randn(n_sample)

lasso = Lasso(alpha=1)
lasso.fit(X, y)

# Predict for one fresh random test point.
xtest = np.random.randn(1, dim)
print(lasso.predict(xtest))

import pickle

# Serialize the fitted model to bytes.
s = pickle.dumps(lasso)
# Noisy response for the test point under the generating model (kept for reference).
ytest = np.dot(xtest, beta) + 0.1*np.random.randn(1)
# Restore a copy of the model from the serialized bytes.
lasso2 = pickle.loads(s)
# Predict with the restored model, not the original, so the printed value
# actually checks that the round trip preserved the fitted model.
ypred2 = lasso2.predict(xtest)
print(ypred2)

[9.04203952]
[9.04203952]

Python数据处理¶

13. scikit-learn ¶

scikit-learn¶

示例：scikit-learn中的分类器¶

示例：scikit-learn中的分类器¶

支持向量机（Support Vector Machine, SVM）算法原理¶

支持向量机算法如何工作？¶

数据不是线性可分¶

数据不是线性可分¶

有监督学习示例：sklearn中的LASSO¶

有监督学习示例：sklearn中的LASSO¶

有监督学习示例：sklearn中的LASSO¶

有监督学习示例：sklearn中的LASSO¶

有监督学习示例：sklearn中的LASSO¶

无监督学习示例：sklearn中的k-GMM¶

sklearn中的模型选择¶

sklearn中的交叉验证¶

cross_val_score¶

R2 score¶

评估模型：sklearn.metrics¶

ARI计算¶

评估模型：sklearn.metrics¶

评估模型：sklearn.metrics¶

模型持久性：序列化模型对象¶

Python数据处理¶

13. scikit-learn ¶

scikit-learn¶

示例：scikit-learn中的分类器¶

示例：scikit-learn中的分类器¶

支持向量机（Support Vector Machine, SVM）算法原理¶

支持向量机算法如何工作？¶

数据不是线性可分¶

数据不是线性可分¶

有监督学习示例：sklearn中的LASSO¶

有监督学习示例：sklearn中的LASSO¶

有监督学习示例：sklearn中的LASSO¶

有监督学习示例：sklearn中的LASSO¶

有监督学习示例：sklearn中的LASSO¶

无监督学习示例：sklearn中的k-GMM¶

sklearn中的模型选择¶

sklearn中的交叉验证¶

cross_val_score¶

R2 score¶

评估模型：sklearn.metrics¶

ARI计算¶

评估模型：sklearn.metrics¶

评估模型：sklearn.metrics¶

模型持久性：序列化模型对象¶

支持向量机（Support Vector Machine, SVM）算法原理¶