IMPORTANT NOTICE!!!
Some of the code in this post has already been published in a paper, so do not copy it verbatim; the duplication ratio may be high, and readers bear sole responsibility for any academic misconduct that results.
To stay safe, please cite the source!!!
#coding=utf-8
import re
import codecs
import os, time, sys
f = codecs.open('C:/Users/thous/PycharmProjects/untitled4/hair.txt', 'r', 'utf-8')
# f = codecs.open('F:/parseWord/tmp/F1040EZ.content.txt', 'r', 'utf-8')
s = f.readlines()
f.close()
# Keep only the review lines that mention 'great'; line_pattern strips a
# leading row number from each line.
line_pattern = r'\s*\d+\s?(.*)'

def func(text):
    c = re.compile(line_pattern)
    lists = []
    for line in text.split('\n'):
        r = c.findall(line)
        if r:
            lists.append(r[0])
    return '\n'.join(lists)

for fileLine in s:
    if u'great' in fileLine:
        result = func(fileLine)
        print(result)
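# A minimal illustration (hypothetical input) of what line_pattern extracts:
#     re.findall(r'\s*\d+\s?(.*)', '  42 great dryer, heats fast')
# returns ['great dryer, heats fast'] -- the leading row number is dropped.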
# result.to_txt('result.txt')
# Redirect subsequent prints to x.txt; the original console stream is saved in `output`.
output = sys.stdout
outputfile = open("C:\\Users\\thous\\PycharmProjects\\untitled4\\x.txt", "a")
sys.stdout = outputfile
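# To get console printing back later (a sketch, not in the original script):
#     sys.stdout = output
#     outputfile.close()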
"""
file = open(r"C:\\Users\\thous\\PycharmProjects\\untitled4\\hair.txt", 'r', encoding='UTF-8')
sum=0
for line in file.readlines():
key = "good"
if key in line:
s = re.findall('"TimeSpan":"([\d.]+)"', line)
print("**************", line)
print("时间为:", s[-1])
sum = sum + float(s[-1])
file.close()
print("总时间为:", sum)
input("123")
"""
import pandas as pd
import numpy as np
import xlrd
import matplotlib.pyplot as plt
m = pd.read_excel("C:\\Users\\thous\\PycharmProjects\\untitled4\\m.xlsx")
# df_stock.to_excel('AAL_stock.xlsx',sheet_name='thousand')
m.set_index(m['review_date'], inplace=True)
review_date = m['review_date']
compound = m['compound']
star_rating = m['star_rating']
m['star_rating_rate'] = m['star_rating'].pct_change()
m['star_rating_rate'].plot(grid=True)
plt.title('star_rating_rate')
plt.show()
m['star_rating'].plot(grid=True)
plt.title('star_rating')
plt.show()
m['compound'].plot(grid=True)
plt.title('compound')
plt.show()
m['compound_rate'] = m['compound'].pct_change()
m['compound_rate'].plot(grid=True)
plt.title('compound_rate')
plt.show()
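# Note: pct_change() computes (x_t - x_{t-1}) / x_{t-1} row over row, so the
# first entry of each *_rate series is NaN; drop it if that matters, e.g.:
#     m['compound_rate'].dropna().plot(grid=True)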
from numpy.random import rand, seed
from scipy.stats import spearmanr
import seaborn as sns
import xgboost as xgb
from xgboost import XGBClassifier, plot_importance
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.metrics import roc_curve, roc_auc_score, auc, accuracy_score, log_loss, classification_report
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from numpy import loadtxt
# from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# from statsmodels.tsa.arima_model import ARIMA
# C:\Users\thous\AppData\Local\pip
# seed random number generator
# C:\Users\thous\AppData\Roaming\Python\Python37\site-packages
# statsmodels-0.11.1-cp38-cp38-win_amd64
"""
data = pd.read_csv("C:\\Users\\thous\\PycharmProjects\\untitled4\\hair_dryer_new.csv")
data.info()
# prepare data
# data['letter_num'] = data['letter_num'].astype('float')
# data['star_rating'] = data['star_rating'].astype('float')
letter_num_ln = data['letter_num_ln'].values.reshape(-1, 1)
letter_num = data['letter_num'].values.reshape(-1, 1)
star_rating = data['star_rating'].values.reshape(-1, 1)
X = letter_num_ln
y = star_rating
# print("letter_num_ln", letter_num_ln)
"""
# ################################################################
"""
# calculate Spearman's rank correlation
seed(1)
coef1, p1 = spearmanr(letter_num, star_rating)
print("Spearman's correlation coefficient: %.3f" % coef1)
# interpret the significance
alpha = 0.05
if p1 > alpha:
    print('Samples are uncorrelated (fail to reject H0) p=%.3f' % p1)
else:
    print('Samples are correlated (reject H0) p=%.3f' % p1)
coef2, p2 = spearmanr(letter_num_ln, star_rating)
print("Spearman's correlation coefficient: %.3f" % coef2)
# interpret the significance
alpha = 0.05
if p2 > alpha:
    print('Samples are uncorrelated (fail to reject H0) p=%.3f' % p2)
else:
    print('Samples are correlated (reject H0) p=%.3f' % p2)
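# Note: Spearman's rho depends only on ranks, and ln() is strictly monotone,
# so coef1 and coef2 should come out identical; only the raw values differ.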
"""
"""
X_train, X_test, y_train, y_test = train_test_split(letter_num_ln, star_rating, test_size=0.2)
# #############################################################################################
# build a logistic-regression classifier
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)  # predictions from the classifier
# print("y_pred", y_pred)
print("LR Test set accuracy score:", accuracy_score(y_test, y_pred))
"""
# ########################################################################################
"""
# simple (single-variable) linear regression
# ### regression statistics
X_in = np.array(X).reshape(-1, 1)
y_in = np.array(y).reshape(-1, 1)
lreg = LinearRegression()
lreg.fit(X_in, y_in)

def get_lr_stats(X, y, model):
    message0 = ('The fitted regression equation is: ' + '\ty = ' + str(model.intercept_[0]) +
                ' + ' + str(model.coef_[0][0]) + '*x')
    from scipy import stats
    n = len(X)
    y_prd = model.predict(X)
    Regression = sum((y_prd - np.mean(y))**2)        # regression sum of squares (SSR)
    Residual = sum((y - y_prd)**2)                   # residual sum of squares (SSE)
    R_square = Regression / (Regression + Residual)  # coefficient of determination R^2
    F = (Regression / 1) / (Residual / (n - 2))      # F statistic
    pf = stats.f.sf(F, 1, n - 2)
    message1 = ('R^2: ' + str(R_square[0]) + ';' + '\n' +
                'SSR: ' + str(Regression[0]) + ';' + '\tSSE: ' + str(Residual[0]) + ';' + '\n' +
                ' F : ' + str(F[0]) + ';' + '\t' + 'pf : ' + str(pf[0]))
    # t test for the slope
    L_xx = n * np.var(X)
    sigma = np.sqrt(Residual / n)
    t = model.coef_ * np.sqrt(L_xx) / sigma
    pt = stats.t.sf(t, n - 2)
    message2 = ' t : ' + str(t[0][0]) + ';' + '\t' + 'pt : ' + str(pt[0][0])
    return print(message0 + '\n' + message1 + '\n' + message2)

get_lr_stats(X_in, y_in, lreg)
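# Hedged note: textbook slope tests estimate sigma with sqrt(SSE / (n - 2));
# the sqrt(SSE / n) used above makes the t statistic slightly optimistic for small n.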
# AttributeError: 'LogisticRegression' object has no attribute 'intercept_'
"""
# ####################################################################################
"""
reg = LinearRegression().fit(X, y)
print("一元回归方程为: Y = %.5fX + (%.5f)" % (reg.coef_[0][0], reg.intercept_[0]))
print("R平方为: %s" % reg.score(X, y))
plt.scatter(X, y, color='black')
plt.plot(X, reg.predict(X), color='red', linewidth=1)
plt.title('star_rating review_num_ln LinearRegression')
plt.show()
# plt.plot(X_parameters,regr.predict(X_parameters),color='red', linewidth=4)
"""
# ##############################################################################
data_counts = pd.read_csv("C:\\Users\\thous\\PycharmProjects\\untitled4\\hair_dryer_data_counts.csv")
data_counts.set_index(data_counts['date'], inplace=True)
hair_dryer_data_counts = data_counts['count']
data_counts['count'].plot(grid=True)
plt.title('data_counts')
data_counts['count_rate'] = data_counts['count'].pct_change()
data_counts['count_rate'].plot(grid=True)
plt.title('data_count_rate')
plt.plot(hair_dryer_data_counts)
plt.title('hair_dryer_data_counts')
# plt.xticks(pd.date_range('2002-03', '2015-09'))
# plt.ylim(-5,110)
# plt.scatter(hair_dryer_data_counts, c='red', alpha=0.4, label='hair_dryer_data_counts')
plt.show()
hair_dryer_counts = np.array(hair_dryer_data_counts).reshape(-1, 1)
# `reg` is fitted in the simple-regression block above; enable that block
# (remove its triple quotes) before running this, or a NameError is raised.
hair_dryer_data_counts_pred1 = reg.predict(hair_dryer_counts)
np.set_printoptions(threshold=np.inf)
print(hair_dryer_data_counts_pred1)  # predictions for the 10 variables
# the model underestimates
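# A hedged alternative to redirecting stdout (hypothetical filename):
#     np.savetxt("hair_dryer_counts_pred.txt", hair_dryer_data_counts_pred1)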
# Redirect subsequent prints to hair_dryer_counts.txt, as with x.txt above.
output = sys.stdout
outputfile = open("C:\\Users\\thous\\PycharmProjects\\untitled4\\hair_dryer_counts.txt", "a")
sys.stdout = outputfile
# ###############################################################
"""
# prediction
hair_dryer_new = pd.read_csv("C:\\Users\\thous\\PycharmProjects\\untitled4\\hair_dryer_new.csv")
# inspect the dependent variable
star_rating_hairdryer = hair_dryer_new['star_rating']
# print("star_rating_hairdryer", hair_dryer_new['star_rating'].value_counts())
sns.countplot(hair_dryer_new.star_rating)
# plt.tick_params(axis='x', labelsize=6)
plt.show()
# keep only the subset we want: drop the neutral records
# data_hair_dryer = hair_dryer_new[hair_dryer_new['compound'] != 0]
# print("data after dropping neutral rows", data)
# encode the categorical flags as 0/1
def coding(col, codeDict):
    colCoded = pd.Series(col, copy=True)
    for key, value in codeDict.items():
        colCoded.replace(key, value, inplace=True)
    return colCoded

hair_dryer_new['verified_purchase'] = coding(hair_dryer_new['verified_purchase'], {'Y': 1, 'N': 0})
print(hair_dryer_new['verified_purchase'])
# works
hair_dryer_new['vine'] = coding(hair_dryer_new['vine'], {'Y': 1, 'N': 0})
print(hair_dryer_new['vine'])
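# Equivalent pandas one-liner (a sketch, not the author's method):
#     hair_dryer_new['vine'] = hair_dryer_new['vine'].map({'Y': 1, 'N': 0})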
"""
"""
# preprocess the variables: map the ordinal star_rating feature
mapping_dict = {
"star_rating": {
"5": 1,
"4": 1,
"3": 1,
"2": 0,
"1": 0,
}
}
hair_dryer_new = hair_dryer_new.replace(mapping_dict)
# TypeError: Cannot compare types 'ndarray(dtype=int64)' and 'str'
print(hair_dryer_new['star_rating'])
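# The TypeError above arises because star_rating is int64 while the mapping
# keys are strings; a sketch of one fix is to map with integer keys instead:
#     hair_dryer_new['star_rating'] = hair_dryer_new['star_rating'].map(
#         {5: 1, 4: 1, 3: 1, 2: 0, 1: 0})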
"""
"""
# pd.value_counts(hair_dryer_new["star_rating"])
# drop irrelevant columns
# data = hair_dryer_new.drop(['initial_list_status', ], axis=1)
# review_date review_id customer_id product_id product_parent product_title marketplace product_category
# review_headline review_body weekday dailyreview_count month
# data_x = hair_dryer_new(['neg', 'neu', 'pos', 'compound', 'helpful_votes', 'total_votes', 'vine', 'verified_purchase'], axis=1)
# neg,neu,pos,compound,helpful_votes,total_votes,vine,verified_purchase
data_x = hair_dryer_new.drop(['star_rating', 'star', 'review_date', 'review_id', 'customer_id', 'product_id',
'product_parent', 'product_title', 'marketplace', 'product_category',
'dailyreview_count', 'month', 'weekday', 'review_date_changed'], axis=1)
# review_headline review_body weekday dailyreview_count month'], axis=1)
# print(data_x)
col = data_x.select_dtypes(include=['int64', 'float64']).columns
# standardize the numeric columns with scikit-learn's preprocessing.StandardScaler
sc = StandardScaler()  # initialize the scaler
data_x[col] = sc.fit_transform(data_x[col])  # standardize the data
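# Hedged note: fitting the scaler on the full dataset before the train/test
# split lets test-set statistics leak into training; the safer order is to
# fit_transform on X_train only and merely transform X_test.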
X = data_x[col]
y = hair_dryer_new['star']
print("y", y)
# ########################################################################
n_sample = y.shape[0]
n_pos_sample = y[y == 0].shape[0]
n_neg_sample = y[y == 1].shape[0]
print('Number of samples: {}; positive: {:.2%}; negative: {:.2%}'.format(
    n_sample, n_pos_sample / n_sample, n_neg_sample / n_sample))
print('Feature dimension:', X.shape[1])
# handle the class imbalance with oversampling
sm = SMOTE(random_state=42)
# fit_sample in older imbalanced-learn releases; renamed fit_resample in current ones
X, y = sm.fit_resample(X, y)
print('After balancing positive and negative samples with SMOTE:')
n_sample = y.shape[0]
n_pos_sample = y[y == 0].shape[0]
n_neg_sample = y[y == 1].shape[0]
print('Number of samples: {}; positive: {:.2%}; negative: {:.2%}'.format(
    n_sample, n_pos_sample / n_sample, n_neg_sample / n_sample))
# logged run: 11464 samples; positive 0.00%; negative 8.99%; feature dimension: 8
# after SMOTE: 33495 samples; positive 0.00%; negative 20.00%
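# Hedged note: oversampling before train_test_split can leak synthetic
# neighbors of test points into training and inflate the reported AUC;
# applying SMOTE to the training fold only is the more conservative choice.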
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# print("训练集X_train", X_train)
# To avoid overfitting
X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train, y_train, test_size=0.3)
params1 = {'booster': 'gbtree', 'objective': 'binary:logistic', 'eval_metric': 'auc', 'max_depth': 3,
'lambda': 5, 'gamma': 0.0, 'subsample': 0.75, 'colsample_bytree': 0.75, 'min_child_weight': 2, 'eta': 0.1,
'seed': 0, 'nthread': 8, 'n_estimators': 50, 'silent': 1}
xgboost = xgb.XGBClassifier(**params1)
xgboost.fit(X_train, y_train)
y_xgboost_test = xgboost.predict_proba(X_test)[:, 1]
fpr_xgboost, tpr_xgboost, _ = roc_curve(y_test, y_xgboost_test)
auc_xgb = roc_auc_score(y_test, y_xgboost_test)
print("Xgboost AUC:", auc_xgb)
# ##########################################################################################
# XGBoost + LR stacking on the 0/1 labels (no missing values)
xgboost_lr = xgb.XGBClassifier(**params1)
# nthread=4, learning_rate=0.08, n_estimators=50, max_depth=5, gamma=0, subsample=0.9, colsample_bytree=0.5
xgb_enc = OneHotEncoder()
xgb_lr = LogisticRegression(n_jobs=4, C=0.1, penalty='l2')
xgboost_lr.fit(X_train, y_train)
xgb_enc.fit(xgboost_lr.apply(X_train)[:, :])
xgb_lr.fit(xgb_enc.transform(xgboost_lr.apply(X_train_lr)[:, :]), y_train_lr)
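# apply() returns, for each sample, the index of the leaf it reaches in every
# tree; one-hot encoding those leaf indices produces the sparse binary
# features the LR above is trained on (the usual GBDT + LR stacking scheme).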
# ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
y_xgb_lr_test = xgb_lr.predict_proba(xgb_enc.transform(xgboost_lr.apply(X_test)[:, :]))[:, 1]
fpr_xgb_lr, tpr_xgb_lr, _ = roc_curve(y_test, y_xgb_lr_test)
auc_xgblr = roc_auc_score(y_test, y_xgb_lr_test)
print("Xgboost + LR AUC:", auc_xgblr)
# ##################################################
# plot the ROC curve
plt.title('XGBoost+LR Receiver Operating Characteristic')
plt.plot(fpr_xgb_lr, tpr_xgb_lr, 'b', label='AUC = %0.5f' % auc_xgblr)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([-0.1, 1.0])
plt.ylim([-0.1, 1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
# ##################################################
# plot the K-S curve
plt.plot(fpr_xgb_lr)
plt.plot(tpr_xgb_lr)
plt.plot(tpr_xgb_lr - fpr_xgb_lr)
plt.title('XGBoost K-S')
plt.xlabel('threshold index')
plt.ylabel('rate')
plt.show()
plt.show()
XGBoost01_LR_KS = max(tpr_xgb_lr - fpr_xgb_lr)
print("xgboost+LR K-S", XGBoost01_LR_KS)
# ##################################################
# feature importance
plot_importance(xgboost_lr, max_num_features=15)
plt.title('XGBoost+LR Feature Importance')
plt.show()
"""