Teqing Cai

Value Investing | Data Science


  • 首页

  • 笔记

  • 归档

  • 搜索
close

逻辑回归

时间: 2022-03-16   |   阅读: 2174 字 ~5分钟

使用逻辑回归完成社交网络广告推荐的预测

内容概要

1 导入数据,了解数据基本信息
2 探索标签和特征分布状况
3 数据预处理
4 初步建模,建立benchmark
5 模型调优
 5.1 数据量纲统一
 5.2 使用交叉验证选取最优参数
 5.3 改变样本权重
6 得到最终模型

1 导入数据,了解数据基本信息

# 导入所需模块和包
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import roc_auc_score,roc_curve,auc

import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# IPython magic (only valid inside a notebook): draw figures inline.
%matplotlib inline
plt.style.use('ggplot') # switch to the ggplot plotting theme (styled after R's plotting library)
plt.rcParams['font.sans-serif']='SimHei'  # use the SimHei font so Chinese text in the figures is displayed

# Load the raw dataset and preview its first rows.
df = pd.read_csv("data/Social_Network_Ads.csv")
df.head()
User ID Gender Age EstimatedSalary Purchased
0 15624510 Male 19 19000 0
1 15810944 Male 35 20000 0
2 15668575 Female 26 43000 0
3 15603246 Female 27 57000 0
4 15804002 Male 19 76000 0
# Show column dtypes and non-null counts.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB
df.duplicated().sum() # count fully duplicated rows (this is not a missing-value check; df.info() above already shows no nulls)
0
df["User ID"].is_unique # True when no user ID occurs more than once
True

2 探索标签和特征分布状况

df.describe()
User ID Age EstimatedSalary Purchased
count 4.000000e+02 400.000000 400.000000 400.000000
mean 1.569154e+07 37.655000 69742.500000 0.357500
std 7.165832e+04 10.482877 34096.960282 0.479864
min 1.556669e+07 18.000000 15000.000000 0.000000
25% 1.562676e+07 29.750000 43000.000000 0.000000
50% 1.569434e+07 37.000000 70000.000000 0.000000
75% 1.575036e+07 46.000000 88000.000000 1.000000
max 1.581524e+07 60.000000 150000.000000 1.000000
df["Purchased"].value_counts()
0    257
1    143
Name: Purchased, dtype: int64
df["Purchased"].value_counts().plot(kind='pie',autopct='%.2f%%');
png
png
# Bar chart of how many rows fall in each gender.
df["Gender"].value_counts().plot(kind="bar");
png
png
# Purchase label counts per gender: rows are genders, columns are the
# Purchased values, drawn as one stacked bar per gender.
data = df.groupby(["Gender", "Purchased"]).size().unstack("Purchased")
data.plot(kind="bar", stacked=True);
png
png
# Two side-by-side panels: a histogram per numeric feature, with the labels overlaid.
plt.figure(figsize=(12,4))
plt.subplot(121)
df["Age"].plot(kind="hist") # age distribution
# Overlay every sample at y = Purchased*100 (label 0 -> y=0, label 1 -> y=100);
# presumably scaled by 100 so the points are readable against the histogram's count axis.
plt.scatter(df["Age"],df["Purchased"]*100,color="red",s=10,alpha=0.2);
plt.title("年龄分布状况")

plt.subplot(122)
df["EstimatedSalary"].plot(kind="hist") # estimated-salary distribution
# Same label overlay as in the left panel.
plt.scatter(df["EstimatedSalary"],df["Purchased"]*100,color="red",s=10,alpha=0.2);
plt.title("收入分布状况")
Text(0.5, 1.0, '收入分布状况')
png
png

3 数据预处理

# Encode the text-valued Gender column as integers: Male -> 0, Female -> 1.
df["Gender"] = df["Gender"].map({"Male":0,"Female":1})
df.head()
User ID Gender Age EstimatedSalary Purchased
0 15624510 0 19 19000 0
1 15810944 0 35 20000 0
2 15668575 1 26 43000 0
3 15603246 1 27 57000 0
4 15804002 0 19 76000 0

4 初步建模,建立benchmark

# Feature matrix: every column except the leading User ID and the trailing label.
# Label vector: the Purchased column.
X = df[["Gender", "Age", "EstimatedSalary"]]
y = df["Purchased"]
X.head()
Gender Age EstimatedSalary
0 0 19 19000
1 0 35 20000
2 1 26 43000
3 1 27 57000
4 0 19 76000
# Hold out 30% of the rows as a test set; the seed is fixed so the split is repeatable.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.3, random_state=12)
# Re-index every part from 0 so that the later column-wise pd.concat with the
# scaled frames lines up row by row.
Xtrain, Xtest, Ytrain, Ytest = (
    part.reset_index(drop=True) for part in (Xtrain, Xtest, Ytrain, Ytest)
)
# Baseline: default logistic regression on the raw, unscaled features.
clf = LR().fit(Xtrain, Ytrain)
print("训练集上的预测准确率为:", clf.score(Xtrain, Ytrain))
print("测试集上的预测准确率为:", clf.score(Xtest, Ytest))
训练集上的预测准确率为: 0.6571428571428571
测试集上的预测准确率为: 0.6083333333333333
confusion_matrix(Ytrain,clf.predict(Xtrain))
array([[184,   0],
       [ 96,   0]], dtype=int64)
area = roc_auc_score(Ytrain,clf.predict_proba(Xtrain)[:,1])
area
0.3393625452898551
print(classification_report(Ytrain,clf.predict(Xtrain)))
              precision    recall  f1-score   support

           0       0.66      1.00      0.79       184
           1       0.00      0.00      0.00        96

    accuracy                           0.66       280
   macro avg       0.33      0.50      0.40       280
weighted avg       0.43      0.66      0.52       280

5 模型调优

5.1 数据量纲统一

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
# Min-max scale the training features. The scaler is fitted on every column
# after the first one; column 0 (Gender, already 0/1) is carried over unchanged.
numeric_part = Xtrain.iloc[:, 1:]
mms = MinMaxScaler().fit(numeric_part)
Xtrain_mms = pd.DataFrame(mms.transform(numeric_part), columns=numeric_part.columns)
Xtrain = pd.concat([Xtrain.iloc[:, 0], Xtrain_mms], axis=1)
Xtrain.head()
Gender Age EstimatedSalary
0 1 0.333333 1.000000
1 1 0.190476 0.148148
2 0 0.452381 0.274074
3 1 0.238095 0.511111
4 0 0.500000 0.674074
# Scale the test features with the scaler already fitted on the training set
# (transform only, no refit); column 0 is carried over unchanged.
numeric_part = Xtest.iloc[:, 1:]
Xtest_mms = pd.DataFrame(mms.transform(numeric_part), columns=numeric_part.columns)
Xtest = pd.concat([Xtest.iloc[:, 0], Xtest_mms], axis=1)
Xtest.head()
Gender Age EstimatedSalary
0 1 0.976190 0.540741
1 0 0.500000 0.414815
2 1 0.738095 0.933333
3 0 0.404762 0.037037
4 0 0.404762 0.088889
clf1 = LR()
clf1 = clf1.fit(Xtrain,Ytrain)
print("训练集上的预测准确率为:",clf1.score(Xtrain,Ytrain))
print("测试集上的预测准确率为:",clf1.score(Xtest,Ytest))
训练集上的预测准确率为: 0.8321428571428572
测试集上的预测准确率为: 0.8166666666666667
confusion_matrix(Ytrain,clf1.predict(Xtrain))
array([[177,   7],
       [ 40,  56]], dtype=int64)
# NOTE(review): this scores hard 0/1 predictions, so the value shown below is the
# mean of the two per-class recalls at the default threshold, not the ranking AUC.
# Pass clf1.predict_proba(Xtrain)[:,1] instead to make it comparable with the
# baseline AUC computed earlier.
roc_auc_score(Ytrain,clf1.predict(Xtrain))
0.7726449275362319
print(classification_report(Ytrain,clf1.predict(Xtrain)))
              precision    recall  f1-score   support

           0       0.82      0.96      0.88       184
           1       0.89      0.58      0.70        96

    accuracy                           0.83       280
   macro avg       0.85      0.77      0.79       280
weighted avg       0.84      0.83      0.82       280

5.2 使用交叉验证选取最优参数

# Sweep C over 100 evenly spaced values and record the mean 5-fold
# cross-validated accuracy for each; report and plot the best one.
Crange = np.linspace(0.001, 15, 100)
s_mean = [cross_val_score(LR(C=c), Xtrain, Ytrain, cv=5).mean() for c in Crange]

# argmax returns the first maximum, matching list.index(max(...)).
best_idx = int(np.argmax(s_mean))
print(f"当C={Crange[best_idx]}时,准确率最高{s_mean[best_idx]}")
plt.plot(Crange, s_mean, c="r");
当C=9.394313131313131时,准确率最高0.8714285714285713
png
png
# Refit with the best C found by the sweep above (9.394..., rounded to 9.4).
clf2 = LR(C=9.4)
clf2 = clf2.fit(Xtrain,Ytrain)
print("训练集上的预测准确率为:",clf2.score(Xtrain,Ytrain))
print("测试集上的预测准确率为:",clf2.score(Xtest,Ytest))
训练集上的预测准确率为: 0.8714285714285714
测试集上的预测准确率为: 0.825
confusion_matrix(Ytrain,clf2.predict(Xtrain))
array([[175,   9],
       [ 27,  69]], dtype=int64)
# NOTE(review): this scores hard 0/1 predictions, so the value shown below is the
# mean of the two per-class recalls at the default threshold, not the ranking AUC.
# Pass clf2.predict_proba(Xtrain)[:,1] instead to get the probability-based AUC.
roc_auc_score(Ytrain,clf2.predict(Xtrain))
0.8349184782608696
print(classification_report(Ytrain,clf2.predict(Xtrain)))
              precision    recall  f1-score   support

           0       0.87      0.95      0.91       184
           1       0.88      0.72      0.79        96

    accuracy                           0.87       280
   macro avg       0.88      0.83      0.85       280
weighted avg       0.87      0.87      0.87       280

5.3 改变样本权重

# Same C as before, but with class_weight="balanced": scikit-learn then weights
# each class inversely to its frequency in Ytrain, so mistakes on the rarer
# class 1 cost more.
clf3 = LR(C=9.4,class_weight="balanced")
clf3 = clf3.fit(Xtrain,Ytrain)
print("训练集上的预测准确率为:",clf3.score(Xtrain,Ytrain))
print("测试集上的预测准确率为:",clf3.score(Xtest,Ytest))
训练集上的预测准确率为: 0.8678571428571429
测试集上的预测准确率为: 0.8083333333333333
confusion_matrix(Ytrain,clf3.predict(Xtrain))
array([[160,  24],
       [ 13,  83]], dtype=int64)
# NOTE(review): this scores hard 0/1 predictions, so the value shown below is the
# mean of the two per-class recalls at the default threshold, not the ranking AUC.
# Pass clf3.predict_proba(Xtrain)[:,1] instead to get the probability-based AUC.
roc_auc_score(Ytrain,clf3.predict(Xtrain))
0.8670742753623188
print(classification_report(Ytrain,clf3.predict(Xtrain)))
              precision    recall  f1-score   support

           0       0.92      0.87      0.90       184
           1       0.78      0.86      0.82        96

    accuracy                           0.87       280
   macro avg       0.85      0.87      0.86       280
weighted avg       0.87      0.87      0.87       280
# If recall on class 1 is to be pushed up at any cost: weight class 1 ten times.
# Class 0 is not listed in the dict; presumably it keeps the default weight of 1
# (confirm against the scikit-learn class_weight documentation).
clf4 = LR(C=9.4,class_weight={1:10})
clf4 = clf4.fit(Xtrain,Ytrain)
print(classification_report(Ytrain,clf4.predict(Xtrain)))
              precision    recall  f1-score   support

           0       0.99      0.59      0.74       184
           1       0.56      0.99      0.71        96

    accuracy                           0.73       280
   macro avg       0.77      0.79      0.73       280
weighted avg       0.84      0.73      0.73       280

6 得到最终模型

# 导入所需模块和包
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import roc_auc_score,roc_curve,auc

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
# Load the raw data and encode Gender as an integer (Male -> 0, Female -> 1).
df = pd.read_csv("data/Social_Network_Ads.csv")
df["Gender"] = df["Gender"].map({"Male": 0, "Female": 1})

# Features: every column between the leading ID and the trailing label.
# Label: the last column.
X = df.iloc[:, 1:-1]
y = df.iloc[:, -1]

# 70/30 train/test split with a fixed seed; each part is re-indexed from 0 so
# the column-wise concatenations below align row by row.
Xtrain, Xtest, Ytrain, Ytest = (
    part.reset_index(drop=True)
    for part in train_test_split(X, y, test_size=0.3, random_state=12)
)

# Fit the min-max scaler on the training set only (all columns after the
# first), then apply the same fitted scaler to both splits. Column 0 (Gender)
# is carried over unscaled.
scaled_cols = Xtrain.columns[1:]
mms = MinMaxScaler().fit(Xtrain.iloc[:, 1:])

Xtrain_mms = pd.DataFrame(mms.transform(Xtrain.iloc[:, 1:]), columns=scaled_cols)
Xtrain = pd.concat([Xtrain.iloc[:, 0], Xtrain_mms], axis=1)

Xtest_mms = pd.DataFrame(mms.transform(Xtest.iloc[:, 1:]), columns=scaled_cols)
Xtest = pd.concat([Xtest.iloc[:, 0], Xtest_mms], axis=1)

# Final model: tuned C with balanced class weights.
clf = LR(C=9.4, class_weight="balanced").fit(Xtrain, Ytrain)
print("训练集上的预测准确率为:", clf.score(Xtrain, Ytrain))
print("测试集上的预测准确率为:", clf.score(Xtest, Ytest))
训练集上的预测准确率为: 0.8678571428571429
测试集上的预测准确率为: 0.8083333333333333
confusion_matrix(Ytest,clf.predict(Xtest))
array([[55, 18],
       [ 5, 42]], dtype=int64)
# NOTE(review): this scores hard 0/1 predictions, so `area` is the mean of the two
# per-class recalls at the default threshold, not the area under the
# probability-based ROC curve. Pass clf.predict_proba(Xtest)[:,1] instead to get
# the probability-based AUC.
area = roc_auc_score(Ytest,clf.predict(Xtest))
area
0.8235208394054211
print(classification_report(Ytest,clf.predict(Xtest)))
              precision    recall  f1-score   support

           0       0.92      0.75      0.83        73
           1       0.70      0.89      0.79        47

    accuracy                           0.81       120
   macro avg       0.81      0.82      0.81       120
weighted avg       0.83      0.81      0.81       120
# ROC curve of the final model on the test set, built from the predicted
# probability of class 1.
FPR, recall, thresholds = roc_curve(Ytest, clf.predict_proba(Xtest)[:, 1])
# Area under exactly the curve drawn below. The previously computed `area`
# came from hard 0/1 predictions and does not describe this curve, so the
# legend uses this value instead; `area` itself is left untouched.
curve_area = auc(FPR, recall)

# Draw the ROC curve against the chance diagonal.
plt.figure()
plt.plot(FPR, recall, color='red',
         label='ROC curve (area = %0.2f)' % curve_area)
plt.plot([0, 1], [0, 1], color='black', linestyle='--')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('Recall')
plt.title('ROC curve')
plt.legend(loc="lower right")
plt.show()
png
png
#算法#
爬取数据
KNN、Kmeans实例
  • 文章目录
  • 站点概览

真诚的生活着

8 日志
3 分类
2 标签
© 2020 - 2023 Teqing Cai
0%