KNN、Kmeans实例

解决的问题：分类问题（KNN）与聚类问题（K-means）
算法：KNN,K-means

KNN算法的实现

# ! pip install scikit-learn -i https://pypi.tuna.tsinghua.edu.cn/simple
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split #划分训练集和测试集函数

# Load the handwritten-digits data set that ships with scikit-learn.
data = load_digits()
# Uncomment the next line to display the loaded object.
#data

2.探索数据集

import pandas as pd

# Present the feature matrix as a table (one column per feature name)
# and preview its first five rows.
pd.DataFrame(
    data.data,
    columns=data.feature_names,
).head()

	pixel_0_2	pixel_0_3	pixel_0_4	pixel_0_5	pixel_1_1	…	pixel_6_6	pixel_7_2	pixel_7_3	pixel_7_4	pixel_7_5	pixel_7_6
0	5.0	13.0	9.0	1.0	0.0	…	0.0	6.0	13.0	10.0	0.0	0.0
1	0.0	12.0	13.0	5.0	0.0	…	0.0	0.0	11.0	16.0	10.0	0.0
2	0.0	4.0	15.0	12.0	0.0	…	5.0	0.0	3.0	11.0	16.0	9.0
3	7.0	15.0	13.0	1.0	8.0	…	9.0	7.0	13.0	13.0	9.0	0.0
4	0.0	1.0	11.0	0.0	0.0	…	0.0	0.0	2.0	16.0	4.0	0.0

5 rows × 64 columns

import numpy as np

# Separate the loaded data set into its feature matrix and label vector.
x, y = data.data, data.target

# List the distinct label values to see which classes are present.
np.unique(y)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

3.切分训练集和测试集

# Hold out 30% of the samples as the test set; the fixed seed keeps the
# split reproducible.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    x,  # features
    y,  # labels
    test_size=0.3,
    random_state=1,
)
# Shape of the training-set feature matrix.
Xtrain.shape

(1257, 64)

# Show the (rows, columns) shape of the test-set feature matrix.
Xtest.shape

(540, 64)

4.建立模型和评估模型

# Create a k-nearest-neighbours classifier with default settings and fit it
# on the training split (fit() returns the fitted estimator itself).
clf = KNeighborsClassifier().fit(Xtrain, Ytrain)
# Evaluate the fitted model on the held-out test split.
score = clf.score(Xtest, Ytest)
score

0.9907407407407407

Kmeans聚类的实现

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings

# Silence every warning raised from this point on; kept ahead of the
# scikit-learn import below, exactly as in the original cell order.
warnings.filterwarnings('ignore')

from sklearn.cluster import KMeans

# Read the tab-separated sample file. It has no header row, so pandas
# numbers the columns 0 and 1.
data = pd.read_csv(
    'data/testSet.txt',
    sep='\t',
    header=None,
)
data.head()

	0	1
0	1.658985	4.285136
1	-3.453687	3.424321
2	4.838138	-1.151539
3	-5.379713	-3.362104
4	0.972564	2.924086

# Show the (rows, columns) shape of the loaded table.
data.shape

(80, 2)

# Scatter the two columns against each other to see how the samples are
# distributed.
plt.scatter(x=data.iloc[:, 0], y=data.iloc[:, 1])

<matplotlib.collections.PathCollection at 0x1c628dd43d0>

png

使用Kmeans算法进行聚类(fit 后调用labels_)

# Build a K-means model that partitions the samples into 4 clusters.
# No random_state is passed here, so the cluster numbering may differ
# from one run to the next.
cluster =KMeans(n_clusters=4)
# Fit on the whole table; as the last expression of the cell, the fitted
# estimator is what the notebook displays.
cluster.fit(data)

KMeans(n_clusters=4)

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

# Inspect the fitted model: labels_ holds the cluster index assigned to
# each sample.
y_pred = cluster.labels_
y_pred

array([0, 2, 3, 1, 0, 2, 3, 1, 0, 2, 3, 1, 0, 2, 3, 1, 0, 2, 3, 1, 0, 2,
       3, 1, 0, 2, 3, 1, 0, 2, 3, 1, 0, 2, 3, 1, 0, 2, 3, 1, 0, 2, 3, 1,
       0, 2, 3, 1, 0, 2, 3, 1, 0, 2, 3, 1, 0, 2, 3, 1, 0, 2, 3, 1, 0, 2,
       3, 1, 0, 2, 3, 1, 0, 2, 3, 1, 0, 2, 3, 1])

# cluster_centers_ holds the centroid coordinates, one row per cluster.
centroid = cluster.cluster_centers_
centroid

array([[ 2.6265299 ,  3.10868015],
       [-3.38237045, -2.9473363 ],
       [-2.46154315,  2.78737555],
       [ 2.80293085, -2.7315146 ]])

# Draw the samples of each cluster in that cluster's own colour.
color = ['red', 'green', 'blue', 'yellow']
for label, shade in enumerate(color):
    members = data.loc[y_pred == label]  # rows assigned to this cluster
    plt.scatter(members[0], members[1], c=shade)

# Mark the centroids with black crosses.
plt.scatter(centroid[:, 0], centroid[:, 1], marker='x', s=100, c='black')
plt.show()

png

使用Kmeans算法进行聚类(直接用接口predict 或 fit_predict)

# Fit a seeded 4-cluster model (fit() returns the fitted estimator), then
# assign every sample to a cluster through the predict() interface.
cluster = KMeans(n_clusters=4, random_state=0).fit(data)
y_pred1 = cluster.predict(data)
y_pred1

array([0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3,
       2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1,
       0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3,
       2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1])

# fit_predict() fits the model and returns each sample's cluster index in
# a single call.
y_pred2 = cluster.fit_predict(data)
y_pred2

array([0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3,
       2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1,
       0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3,
       2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1])

当数据量很大时，为了节省计算资源，可以以牺牲一定精度为代价，只用少量样本来确定质心，其余样本再调用predict接口完成簇的划分

# Compare clustering fitted on the full data (left panel) with clustering
# whose centroids were determined from a small sub-sample (right panel).
fig, axs = plt.subplots(1, 2, figsize=(10, 4))
color = ['red', 'green', 'blue', 'purple']

# --- Model fitted on the complete data set ---
cluster = KMeans(n_clusters=4, random_state=0).fit(data)
y_pred = cluster.labels_
for i in range(4):  # draw each cluster in its own colour
    axs[0].scatter(data.loc[y_pred == i, 0], data.loc[y_pred == i, 1], c=color[i])
# Plot the centroids of THIS model. The original reused the `centroid`
# array from an earlier, unseeded fit, which does not belong to this model.
centroid_full = cluster.cluster_centers_
axs[0].scatter(centroid_full[:, 0], centroid_full[:, 1], marker='x', s=100, c='black')

# --- Model fitted on the first 10 samples only ---
cluster_smallsub = KMeans(n_clusters=4, random_state=0).fit(data[:10])
# The remaining samples are assigned to the sub-sample centroids via predict().
y_pred_1 = cluster_smallsub.predict(data)
for i in range(4):  # draw each cluster in its own colour
    axs[1].scatter(data.loc[y_pred_1 == i, 0], data.loc[y_pred_1 == i, 1], c=color[i])
# Plot the centroids learned from the sub-sample, so the panel shows what
# the reduced fit actually produced rather than the full-data centroids.
centroid_sub = cluster_smallsub.cluster_centers_
axs[1].scatter(centroid_sub[:, 0], centroid_sub[:, 1], marker='x', s=100, c='black')

plt.show()

png