逻辑回归 - Teqing Cai

使用逻辑回归完成社交网络广告推荐的预测

内容概要

1 导入数据，了解数据基本信息
2 探索标签和特征分布状况
3 数据预处理
4 初步建模，建立benchmark
5 模型调优
5.1 数据量纲统一
5.2 使用交叉验证选取最优参数
5.3 改变样本权重
6 得到最终模型

1 导入数据，了解数据基本信息

# 导入所需模块和包
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import roc_auc_score,roc_curve,auc

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
plt.style.use('ggplot') # switch the plotting style to ggplot (the look of R's plotting library)
plt.rcParams['font.sans-serif']='SimHei'  ## use the SimHei font so Chinese labels are displayed

# Load the raw dataset and preview the first rows
df = pd.read_csv("data/Social_Network_Ads.csv")
df.head()

	User ID	Gender	Age	EstimatedSalary
0	15624510	Male	19	19000
1	15810944	Male	35	20000
2	15668575	Female	26	43000
3	15603246	Female	27	57000
4	15804002	Male	19	76000

# Show basic information about the data (columns, non-null counts, dtypes)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB

df.duplicated().sum() # count fully duplicated rows (this checks duplicates, not missing values)

# Check that no user ID is repeated: the number of distinct IDs must equal the row count
user_ids = df["User ID"]
np.unique(user_ids).size == len(df)

True

2 探索标签和特征分布状况

# Summary statistics (count, mean, std, quartiles, min/max) of the columns
df.describe()

	User ID	Age	EstimatedSalary	Purchased
count	4.000000e+02	400.000000	400.000000	400.000000
mean	1.569154e+07	37.655000	69742.500000	0.357500
std	7.165832e+04	10.482877	34096.960282	0.479864
min	1.556669e+07	18.000000	15000.000000	0.000000
25%	1.562676e+07	29.750000	43000.000000	0.000000
50%	1.569434e+07	37.000000	70000.000000	0.000000
75%	1.575036e+07	46.000000	88000.000000	1.000000
max	1.581524e+07	60.000000	150000.000000	1.000000

# Number of samples for each value of the label column
df["Purchased"].value_counts()

0    257
1    143
Name: Purchased, dtype: int64

# Pie chart of the label distribution, annotated with percentages (two decimals)
df["Purchased"].value_counts().plot(kind='pie',autopct='%.2f%%');

png

# Bar chart of the gender distribution
df["Gender"].value_counts().plot(kind="bar");

png

# Purchase intent broken down by gender: count every (Gender, Purchased) pair,
# spread the Purchased values into columns, then draw one stacked bar per gender.
data = df.groupby(["Gender", "Purchased"]).size().unstack("Purchased")
data.plot(kind="bar", stacked=True);

png

# Two side-by-side panels: age (left) and estimated salary (right).
plt.figure(figsize=(12,4))
plt.subplot(121)
df["Age"].plot(kind="hist") # histogram of the age distribution
# Overlay every user as a faint red dot whose height is Purchased*100, so the
# label values can be read against the histogram's count axis.
plt.scatter(df["Age"],df["Purchased"]*100,color="red",s=10,alpha=0.2);
plt.title("年龄分布状况")

plt.subplot(122)
df["EstimatedSalary"].plot(kind="hist") # histogram of the estimated salary
# Same overlay of the label values, this time against salary.
plt.scatter(df["EstimatedSalary"],df["Purchased"]*100,color="red",s=10,alpha=0.2);
plt.title("收入分布状况")

Text(0.5, 1.0, '收入分布状况')

png

3 数据预处理

# Encode the text-valued gender column as integers
gender_codes = {"Male": 0, "Female": 1}
df["Gender"] = df["Gender"].map(gender_codes)

# Preview the first rows after encoding the gender column
df.head()

	User ID	Gender	Age	EstimatedSalary
0	15624510	0	19	19000
1	15810944	0	35	20000
2	15668575	1	26	43000
3	15603246	1	27	57000
4	15804002	0	19	76000

4 初步建模，建立benchmark

# Build the feature matrix and the label vector by column position:
# features are every column between the first and the last one,
# the label is the last column.
feature_columns = df.columns[1:-1]
label_column = df.columns[-1]
X = df.loc[:, feature_columns]
y = df[label_column]

# Preview the first rows of the feature matrix
X.head()

	Gender	Age	EstimatedSalary
0	0	19	19000
1	0	35	20000
2	1	26	43000
3	1	27	57000
4	0	19	76000

# Hold out 30% of the samples as a test set (fixed seed for reproducibility),
# and give each of the four pieces a fresh 0..n-1 index.
Xtrain, Xtest, Ytrain, Ytest = (
    part.reset_index(drop=True)
    for part in train_test_split(X, y, test_size=0.3, random_state=12)
)

# Baseline: logistic regression with default settings on the features as they are
clf = LR().fit(Xtrain, Ytrain)
train_accuracy = clf.score(Xtrain, Ytrain)
test_accuracy = clf.score(Xtest, Ytest)
print("训练集上的预测准确率为：", train_accuracy)
print("测试集上的预测准确率为：", test_accuracy)

训练集上的预测准确率为： 0.6571428571428571
测试集上的预测准确率为： 0.6083333333333333

# Confusion matrix of the baseline model on the training set
confusion_matrix(Ytrain,clf.predict(Xtrain))

array([[184,   0],
       [ 96,   0]], dtype=int64)

# ROC AUC of the baseline model on the training set, scored with the predicted
# probability of class 1 (second column of predict_proba)
area = roc_auc_score(Ytrain,clf.predict_proba(Xtrain)[:,1])
area

0.3393625452898551

# Per-class precision / recall / F1 of the baseline model on the training set
print(classification_report(Ytrain,clf.predict(Xtrain)))

              precision    recall  f1-score   support

           0       0.66      1.00      0.79       184
           1       0.00      0.00      0.00        96

    accuracy                           0.66       280
   macro avg       0.33      0.50      0.40       280
weighted avg       0.43      0.66      0.52       280

5 模型调优

5.1 数据量纲统一

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score

# Min-max scale the training features. The first column (Gender) is left
# untouched; only the remaining columns are rescaled.
numeric_train = Xtrain.iloc[:, 1:]
mms = MinMaxScaler().fit(numeric_train)
Xtrain_mms = pd.DataFrame(mms.transform(numeric_train), columns=numeric_train.columns)
# Glue the untouched first column back in front of the scaled columns
Xtrain = pd.concat([Xtrain.iloc[:, 0], Xtrain_mms], axis=1)
Xtrain.head()

	Gender	Age	EstimatedSalary
0	1	0.333333	1.000000
1	1	0.190476	0.148148
2	0	0.452381	0.274074
3	1	0.238095	0.511111
4	0	0.500000	0.674074

# Apply the already-fitted `mms` scaler to the test features (no refitting),
# again leaving the first column untouched.
numeric_test = Xtest.iloc[:, 1:]
Xtest_mms = pd.DataFrame(mms.transform(numeric_test), columns=numeric_test.columns)
Xtest = pd.concat([Xtest.iloc[:, 0], Xtest_mms], axis=1)
Xtest.head()

	Gender	Age	EstimatedSalary
0	1	0.976190	0.540741
1	0	0.500000	0.414815
2	1	0.738095	0.933333
3	0	0.404762	0.037037
4	0	0.404762	0.088889

# Same default logistic regression, now trained on the scaled features
clf1 = LR().fit(Xtrain, Ytrain)
for caption, features, labels in (
    ("训练集上的预测准确率为：", Xtrain, Ytrain),
    ("测试集上的预测准确率为：", Xtest, Ytest),
):
    print(caption, clf1.score(features, labels))

训练集上的预测准确率为： 0.8321428571428572
测试集上的预测准确率为： 0.8166666666666667

# Confusion matrix of clf1 on the training set
confusion_matrix(Ytrain,clf1.predict(Xtrain))

array([[177,   7],
       [ 40,  56]], dtype=int64)

# AUC score of clf1 on the training set.
# NOTE(review): the score is computed from hard class labels (`predict`), so it
# describes a single operating point; passing `predict_proba(...)[:, 1]` would
# give the threshold-independent ROC AUC — confirm which one is intended.
roc_auc_score(Ytrain,clf1.predict(Xtrain))

0.7726449275362319

# Per-class precision / recall / F1 of clf1 on the training set
print(classification_report(Ytrain,clf1.predict(Xtrain)))

              precision    recall  f1-score   support

           0       0.82      0.96      0.88       184
           1       0.89      0.58      0.70        96

    accuracy                           0.83       280
   macro avg       0.85      0.77      0.79       280
weighted avg       0.84      0.83      0.82       280

5.2 使用交叉验证选取最优参数

# Scan 100 evenly spaced values of C and, for each one, record the mean score
# of a 5-fold cross-validation on the training set.
Crange = np.linspace(0.001, 15, 100)
s_mean = [
    cross_val_score(LR(C=c_value), Xtrain, Ytrain, cv=5).mean()
    for c_value in Crange
]

# Report the first C that reaches the highest mean score, then plot the whole curve
best_score = max(s_mean)
best_C = Crange[s_mean.index(best_score)]
print(f"当C={best_C}时,准确率最高{best_score}")
plt.plot(Crange, s_mean, c="r");

当C=9.394313131313131时,准确率最高0.8714285714285713

png

# Refit using the C value chosen from the cross-validation scan
clf2 = LR(C=9.4).fit(Xtrain, Ytrain)
train_accuracy = clf2.score(Xtrain, Ytrain)
test_accuracy = clf2.score(Xtest, Ytest)
print("训练集上的预测准确率为：", train_accuracy)
print("测试集上的预测准确率为：", test_accuracy)

训练集上的预测准确率为： 0.8714285714285714
测试集上的预测准确率为： 0.825

# Confusion matrix of clf2 on the training set
confusion_matrix(Ytrain,clf2.predict(Xtrain))

array([[175,   9],
       [ 27,  69]], dtype=int64)

# AUC score of clf2 on the training set.
# NOTE(review): the score is computed from hard class labels (`predict`), so it
# describes a single operating point; passing `predict_proba(...)[:, 1]` would
# give the threshold-independent ROC AUC — confirm which one is intended.
roc_auc_score(Ytrain,clf2.predict(Xtrain))

0.8349184782608696

# Per-class precision / recall / F1 of clf2 on the training set
print(classification_report(Ytrain,clf2.predict(Xtrain)))

              precision    recall  f1-score   support

           0       0.87      0.95      0.91       184
           1       0.88      0.72      0.79        96

    accuracy                           0.87       280
   macro avg       0.88      0.83      0.85       280
weighted avg       0.87      0.87      0.87       280

5.3 改变样本权重

# Same C as before, but with class weights set to "balanced"
clf3 = LR(C=9.4, class_weight="balanced").fit(Xtrain, Ytrain)
for caption, features, labels in (
    ("训练集上的预测准确率为：", Xtrain, Ytrain),
    ("测试集上的预测准确率为：", Xtest, Ytest),
):
    print(caption, clf3.score(features, labels))

训练集上的预测准确率为： 0.8678571428571429
测试集上的预测准确率为： 0.8083333333333333

# Confusion matrix of clf3 on the training set
confusion_matrix(Ytrain,clf3.predict(Xtrain))

array([[160,  24],
       [ 13,  83]], dtype=int64)

# AUC score of clf3 on the training set.
# NOTE(review): the score is computed from hard class labels (`predict`), so it
# describes a single operating point; passing `predict_proba(...)[:, 1]` would
# give the threshold-independent ROC AUC — confirm which one is intended.
roc_auc_score(Ytrain,clf3.predict(Xtrain))

0.8670742753623188

# Per-class precision / recall / F1 of clf3 on the training set
print(classification_report(Ytrain,clf3.predict(Xtrain)))

              precision    recall  f1-score   support

           0       0.92      0.87      0.90       184
           1       0.78      0.86      0.82        96

    accuracy                           0.87       280
   macro avg       0.85      0.87      0.86       280
weighted avg       0.87      0.87      0.87       280

# What if recall had to be raised at any cost? Give class 1 a weight of 10
# and look at the resulting per-class metrics on the training set.
recall_first_weights = {1: 10}
clf4 = LR(C=9.4, class_weight=recall_first_weights).fit(Xtrain, Ytrain)
train_predictions = clf4.predict(Xtrain)
print(classification_report(Ytrain, train_predictions))

              precision    recall  f1-score   support

           0       0.99      0.59      0.74       184
           1       0.56      0.99      0.71        96

    accuracy                           0.73       280
   macro avg       0.77      0.79      0.73       280
weighted avg       0.84      0.73      0.73       280

6 得到最终模型

# 导入所需模块和包
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import roc_auc_score,roc_curve,auc

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score

# Load the raw data and encode the text-valued gender column as integers
df = pd.read_csv("data/Social_Network_Ads.csv")
gender_codes = {"Male": 0, "Female": 1}
df["Gender"] = df["Gender"].map(gender_codes)

# Features are every column between the first and the last; the label is the last column
X, y = df.iloc[:, 1:-1], df.iloc[:, -1]

# Hold out 30% as a test set (fixed seed) and give every piece a fresh 0..n-1
# index so the column-wise concat below lines rows up by position.
Xtrain, Xtest, Ytrain, Ytest = (
    part.reset_index(drop=True)
    for part in train_test_split(X, y, test_size=0.3, random_state=12)
)

# Fit the min-max scaler on the training columns after the first one (Gender stays as is)
mms = MinMaxScaler().fit(Xtrain.iloc[:, 1:])

# Scale the training set and put the untouched first column back in front
Xtrain_mms = pd.DataFrame(
    mms.transform(Xtrain.iloc[:, 1:]), columns=Xtrain.columns[1:]
)
Xtrain = pd.concat([Xtrain.iloc[:, 0], Xtrain_mms], axis=1)

# Scale the test set with the scaler fitted on the training set
Xtest_mms = pd.DataFrame(
    mms.transform(Xtest.iloc[:, 1:]), columns=Xtest.columns[1:]
)
Xtest = pd.concat([Xtest.iloc[:, 0], Xtest_mms], axis=1)

# Final model: chosen C plus balanced class weights
clf = LR(C=9.4, class_weight="balanced").fit(Xtrain, Ytrain)
train_accuracy = clf.score(Xtrain, Ytrain)
test_accuracy = clf.score(Xtest, Ytest)
print("训练集上的预测准确率为：", train_accuracy)
print("测试集上的预测准确率为：", test_accuracy)

训练集上的预测准确率为： 0.8678571428571429
测试集上的预测准确率为： 0.8083333333333333

# Confusion matrix of the final model on the held-out test set
confusion_matrix(Ytest,clf.predict(Xtest))

array([[55, 18],
       [ 5, 42]], dtype=int64)

# AUC score of the final model on the test set.
# NOTE(review): the score is computed from hard class labels (`predict`), so it
# describes a single operating point; passing `predict_proba(...)[:, 1]` would
# give the threshold-independent ROC AUC — confirm which one is intended.
area = roc_auc_score(Ytest,clf.predict(Xtest))
area

0.8235208394054211

# Per-class precision / recall / F1 of the final model on the test set
print(classification_report(Ytest,clf.predict(Xtest)))

              precision    recall  f1-score   support

           0       0.92      0.75      0.83        73
           1       0.70      0.89      0.79        47

    accuracy                           0.81       120
   macro avg       0.81      0.82      0.81       120
weighted avg       0.83      0.81      0.81       120

# ROC curve of the final model on the test set, built from the predicted
# probability of class 1 (second column of predict_proba).
FPR, recall, thresholds = roc_curve(Ytest, clf.predict_proba(Xtest)[:, 1])

# Area under the curve that is actually drawn. The existing `area` variable is
# computed from hard 0/1 predictions (`clf.predict`), so using it in the legend
# would label this probability-based curve with a number from a different curve.
curve_area = auc(FPR, recall)

# Draw the ROC curve
plt.figure()
plt.plot(FPR, recall, color='red',
         label='ROC curve (area = %0.2f)' % curve_area)
# Diagonal reference line: the curve of a classifier that guesses at random
plt.plot([0, 1], [0, 1], color='black', linestyle='--')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('Recall')
plt.title('ROC curve')
plt.legend(loc="lower right")
plt.show()

png