解决的问题:分类问题(KNN)与聚类问题(K-means)
算法:KNN,K-means
KNN算法的实现
# ! pip install scikit-learn -i https://pypi.tuna.tsinghua.edu.cn/simple
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split #划分训练集和测试集函数
# Load scikit-learn's digits dataset; the cells below read its .data, .target and .feature_names.
data = load_digits()
# data  (uncomment to inspect the raw dataset object)
2.探索数据集
import pandas as pd

# Turn the feature matrix into a table (one named column per pixel) and preview the first rows.
digits_frame = pd.DataFrame(data.data, columns=data.feature_names)
digits_frame.head()
pixel_0_0 | pixel_0_1 | pixel_0_2 | pixel_0_3 | pixel_0_4 | pixel_0_5 | pixel_0_6 | pixel_0_7 | pixel_1_0 | pixel_1_1 | … | pixel_6_6 | pixel_6_7 | pixel_7_0 | pixel_7_1 | pixel_7_2 | pixel_7_3 | pixel_7_4 | pixel_7_5 | pixel_7_6 | pixel_7_7 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 5.0 | 13.0 | 9.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | … | 0.0 | 0.0 | 0.0 | 0.0 | 6.0 | 13.0 | 10.0 | 0.0 | 0.0 | 0.0 |
1 | 0.0 | 0.0 | 0.0 | 12.0 | 13.0 | 5.0 | 0.0 | 0.0 | 0.0 | 0.0 | … | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 11.0 | 16.0 | 10.0 | 0.0 | 0.0 |
2 | 0.0 | 0.0 | 0.0 | 4.0 | 15.0 | 12.0 | 0.0 | 0.0 | 0.0 | 0.0 | … | 5.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | 11.0 | 16.0 | 9.0 | 0.0 |
3 | 0.0 | 0.0 | 7.0 | 15.0 | 13.0 | 1.0 | 0.0 | 0.0 | 0.0 | 8.0 | … | 9.0 | 0.0 | 0.0 | 0.0 | 7.0 | 13.0 | 13.0 | 9.0 | 0.0 | 0.0 |
4 | 0.0 | 0.0 | 0.0 | 1.0 | 11.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | … | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 16.0 | 4.0 | 0.0 | 0.0 |
5 rows × 64 columns
import numpy as np

# Separate the dataset into the feature matrix (x) and the label vector (y).
x, y = data.data, data.target
np.unique(y)  # list the distinct label classes
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
3.切分训练集和测试集
# Hold out 30% of the samples for testing; random_state=1 is passed so the split can be reproduced.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    x,  # features
    y,  # labels
    test_size=0.3,
    random_state=1,
)
# Size of the training split as (samples, features).
Xtrain.shape
(1257, 64)
# Size of the test split as (samples, features).
Xtest.shape
(540, 64)
4.建立模型和评估模型
# Instantiate a k-nearest-neighbours classifier with default settings and
# train it on the training split (fit() returns the fitted estimator).
clf = KNeighborsClassifier().fit(Xtrain, Ytrain)
# Evaluate the trained model on the held-out test split.
score = clf.score(Xtest, Ytest)
score
0.9907407407407407
Kmeans聚类的实现
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.cluster import KMeans
# Tab-separated file without a header row, so columns get integer labels 0 and 1.
dataset_path = 'data/testSet.txt'
data = pd.read_csv(dataset_path, sep='\t', header=None)
data.head()
0 | 1 | |
---|---|---|
0 | 1.658985 | 4.285136 |
1 | -3.453687 | 3.424321 |
2 | 4.838138 | -1.151539 |
3 | -5.379713 | -3.362104 |
4 | 0.972564 | 2.924086 |
# Number of rows and columns in the loaded table.
data.shape
(80, 2)
# Explore how the samples are distributed: first column on the x-axis, second on the y-axis.
first_feature = data.iloc[:, 0]
second_feature = data.iloc[:, 1]
plt.scatter(first_feature, second_feature)
<matplotlib.collections.PathCollection at 0x1c628dd43d0>
使用Kmeans算法进行聚类(fit 后调用labels_)
# Fit a 4-cluster K-means model on the full table. No random_state is set here,
# so the numbering of the clusters can differ from run to run.
cluster = KMeans(n_clusters=4).fit(data)
cluster
KMeans(n_clusters=4)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KMeans(n_clusters=4)
# Use the fitted model's attributes to inspect the clustering result.
y_pred= cluster.labels_ # labels_ gives the cluster index assigned to each sample
y_pred
array([0, 2, 3, 1, 0, 2, 3, 1, 0, 2, 3, 1, 0, 2, 3, 1, 0, 2, 3, 1, 0, 2,
3, 1, 0, 2, 3, 1, 0, 2, 3, 1, 0, 2, 3, 1, 0, 2, 3, 1, 0, 2, 3, 1,
0, 2, 3, 1, 0, 2, 3, 1, 0, 2, 3, 1, 0, 2, 3, 1, 0, 2, 3, 1, 0, 2,
3, 1, 0, 2, 3, 1, 0, 2, 3, 1, 0, 2, 3, 1])
centroid= cluster.cluster_centers_ # cluster_centers_ gives the centroid coordinates of each cluster
centroid
array([[ 2.6265299 , 3.10868015],
[-3.38237045, -2.9473363 ],
[-2.46154315, 2.78737555],
[ 2.80293085, -2.7315146 ]])
# Draw the samples of each cluster in its own colour.
color = ['red', 'green', 'blue', 'yellow']
for cluster_id, cluster_color in enumerate(color):
    in_cluster = y_pred == cluster_id  # boolean mask selecting this cluster's rows
    plt.scatter(data.loc[in_cluster, 0], data.loc[in_cluster, 1], c=cluster_color)
# Mark the centroids with black crosses on top of the samples.
plt.scatter(centroid[:, 0], centroid[:, 1], marker='x', s=100, c='black')
plt.show()
使用Kmeans算法进行聚类(直接用接口predict 或 fit_predict)
# Pass random_state=0 so repeated runs give the same clustering, then train on the full table.
cluster = KMeans(n_clusters=4, random_state=0).fit(data)
# predict: ask the fitted model for the cluster index of each sample.
y_pred1 = cluster.predict(data)
y_pred1
array([0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3,
2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1,
0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3,
2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1])
# fit_predict: fit the model and return each sample's cluster index in one call
# (per the scikit-learn docs this re-fits `cluster` on `data`).
y_pred2=cluster.fit_predict(data)
y_pred2
array([0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3,
2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1,
0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3,
2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1])
当数据量很大时,为了节省计算资源,可以以牺牲一定精度为代价,只用少量样本数据确定质心,其余样本再调用predict接口完成簇的划分
# Side-by-side comparison: K-means fitted on all samples (left) versus
# K-means fitted on only the first 10 samples (right).
fig, axs = plt.subplots(1, 2, figsize=(10, 4))
color = ['red', 'green', 'blue', 'purple']

# Left panel: model built from the complete data set.
cluster = KMeans(n_clusters=4, random_state=0).fit(data)
y_pred = cluster.labels_
# Use this model's own centroids rather than the ones left over from an earlier fit.
centroid_full = cluster.cluster_centers_
for cluster_id, cluster_color in enumerate(color):  # draw each cluster
    in_cluster = y_pred == cluster_id
    axs[0].scatter(data.loc[in_cluster, 0], data.loc[in_cluster, 1], c=cluster_color)
axs[0].scatter(centroid_full[:, 0], centroid_full[:, 1], marker='x', s=100, c='black')  # centroids

# Right panel: centroids learned from the first 10 samples only; every sample
# is then assigned to a cluster with predict().
cluster_smallsub = KMeans(n_clusters=4, random_state=0).fit(data[:10])
y_pred_1 = cluster_smallsub.predict(data)
# Plot the sub-sample model's centroids so the panel shows what it actually learned.
centroid_smallsub = cluster_smallsub.cluster_centers_
for cluster_id, cluster_color in enumerate(color):  # draw each cluster
    in_cluster = y_pred_1 == cluster_id
    axs[1].scatter(data.loc[in_cluster, 0], data.loc[in_cluster, 1], c=cluster_color)
axs[1].scatter(centroid_smallsub[:, 0], centroid_smallsub[:, 1], marker='x', s=100, c='black')  # centroids
plt.show()