今日代码(200623)--回厂日期预测(python + R)

代码笔记,仅供参考


回厂日期预测


前言:本文对不同客户的下一次返厂时间进行预测。大多数客户的返厂次数不足10次,仅有少量客户的返厂次数大于30次。

平均值法预测(python)


# -*- coding: utf-8 -*-

import pymysql
import time
import numpy as np
import pandas as pd


class CarTest:
    """Predict each customer's next return interval with an averaging heuristic.

    Workflow (see ``main``): read ``(username, ordertime)`` rows from the
    ``CarTest`` MySQL table, hold out every customer's most recent visit,
    predict the held-out interval from the earlier visits, and score the
    prediction.

    Attributes:
        db: Open pymysql connection.
        cur: Cursor on ``db``.
        avertime: Unused placeholder, kept for backward compatibility.
        dataframe: Rows returned by ``getMysqData`` (``None`` until loaded).
        preTime: Maps customer -> ``[predicted interval, actual interval,
            second-latest visit, latest visit]``.
        Accuracy: Share of customers whose prediction error is under 3 days.
    """

    def __init__(self, host='127.0.0.1', port=3306, user='root',
                 password='19970928', database='datacup', charset='utf8'):
        """Open the MySQL connection and initialise the result holders.

        All parameters are forwarded to ``pymysql.connect``; the defaults
        reproduce the connection settings this script has always used.
        """
        # NOTE(review): the default credentials are hard-coded in source;
        # pass them in (e.g. from the environment) for anything shared.
        self.db = pymysql.connect(host=host,
                                  port=port,
                                  user=user,
                                  password=password,
                                  database=database,
                                  charset=charset)
        self.cur = self.db.cursor()
        self.avertime = None
        # Was misspelled ``datetframe``, so the attribute the other methods
        # read did not exist until getMysqData() had run.
        self.dataframe = None
        self.preTime = {}
        self.Accuracy = 0

    def getDate(self):
        """Import ./data/car_srv_train.csv into MySQL, one insert per line.

        The first line is treated as a header and skipped. Blank lines are
        skipped as well, since they cannot supply the three insert values.
        """
        with open("./data/car_srv_train.csv", 'r') as f:
            next(f, None)  # discard the header row
            for line in f:
                if line.strip():
                    self.mysqlInteractive(line)

    def timePre(self):
        """Predict the held-out (latest) interval for every eligible customer.

        Only customers with more than 5 visits are considered. For each one
        the latest visit is held out; the prediction is the mean of
        (a) the average interval over the remaining visits and
        (b) the most recent interval among the remaining visits.
        Results are stored in ``self.preTime``.

        Raises:
            RuntimeError: if no rows have been loaded yet.
        """
        if self.dataframe is None:
            raise RuntimeError("no data loaded; call getMysqData() first")

        visits_by_customer = {}
        for customer, visit_time in self.dataframe:
            visits_by_customer.setdefault(customer, []).append(visit_time)

        for customer, visits in visits_by_customer.items():
            if len(visits) <= 5:
                continue
            newest_first = sorted(visits, reverse=True)
            latest, previous = newest_first[0], newest_first[1]
            # With the latest visit held out, len - 1 visits remain, which
            # span len - 2 intervals. Dividing by len - 1 (as before)
            # systematically underestimated the average interval.
            history_intervals = len(newest_first) - 2
            averTimeInterval = (previous - newest_first[-1]) / history_intervals
            recent_interval = previous - newest_first[2]
            predicted = (averTimeInterval + recent_interval) / 2
            # 1: predicted interval, 2: actual interval,
            # 3: second-latest visit, 4: latest visit
            self.preTime[customer] = [predicted, latest - previous,
                                      previous, latest]

    def errorTest(self):
        """Score the predictions and store the hit rate in ``self.Accuracy``.

        A prediction counts as correct when it differs from the actual
        interval by less than 3 days. (The original note says "3 working
        days", but only plain elapsed time is compared.)
        With no predictions at all, ``Accuracy`` is set to 0 instead of
        raising ``ZeroDivisionError``.
        """
        countNum = len(self.preTime)
        if countNum == 0:
            self.Accuracy = 0
            return

        tolerance_seconds = 3 * 24 * 60 * 60
        # Compare the absolute duration rather than ``timedelta.days``:
        # ``.days`` floors, so -2.5 days reported -3 and was rejected while
        # +2.5 days reported 2 and was accepted.
        trueNum = sum(
            1 for record in self.preTime.values()
            if abs((record[1] - record[0]).total_seconds()) < tolerance_seconds
        )
        self.Accuracy = trueNum / countNum

    def mysqlInteractive(self, data):
        """Insert one CSV line into the ``CarTest`` table.

        Args:
            data: A raw comma-separated line; its first three fields are
                bound to ``ordername``, ``username`` and ``ordertime``.

        Failures are rolled back and printed so that one bad row does not
        abort the whole import.
        """
        # Strip the line terminator so it cannot end up inside a field
        # when the line has three or fewer columns.
        dataList = data.rstrip("\r\n").split(",")[0:3]
        sqlLine = "insert into CarTest(ordername, username, ordertime) \
                    values(%s, %s, %s);"
        try:
            self.cur.execute(sqlLine, dataList)
            self.db.commit()
            print("提交成功...")
        except Exception as e:
            # Deliberately broad: this is a best-effort per-row boundary.
            self.db.rollback()
            print('错误信息:', e)

    def getMysqData(self):
        """Load all ``(username, ordertime)`` rows into ``self.dataframe``.

        Rows are ordered by customer and then by visit time.
        """
        sqlLine = "select username, ordertime from CarTest \
                    order by username,ordertime;"
        self.cur.execute(sqlLine)
        self.dataframe = self.cur.fetchall()

    def main(self):
        """Run load -> predict -> score, then always release the connection.

        ``getDate()`` (the one-off CSV import) is intentionally not called
        here; run it separately when the table needs to be populated.
        """
        try:
            self.getMysqData()
            self.timePre()
            self.errorTest()
        finally:
            # Close even when a step above raises, so the cursor and the
            # connection are never leaked.
            self.cur.close()
            self.db.close()

if __name__ == '__main__':
    # Wall-clock timestamp taken before any work starts.
    began_at = time.time()
    predictor = CarTest()
    predictor.main()
    print("准确率为:", predictor.Accuracy)
    # Elapsed wall-clock time once the accuracy has been reported.
    elapsed = time.time() - began_at
    print('执行时间:%.2f' % elapsed)

输出:

准确率为: 0.016651248843663275
执行时间:23.67

好吧,准确率低得可以。


时间序列预测(R语言)


library(forecast)


# ---- Load the data ----


getwd()
setwd("C:/Users/goatbishop/Desktop/data")
car_srv_train <- read.csv(
  "car_srv_train.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Quick look at the raw table: first rows, then its dimensions.
head(car_srv_train)
dim(car_srv_train)


# Inspect the column types, then convert ORDERDATE to Date so that
# differences between visits can be computed.
str(car_srv_train)
car_srv_train$ORDERDATE <- as.Date(car_srv_train$ORDERDATE)

test0624car <- car_srv_train
head(diff(test0624car$ORDERDATE))

# Sort ascending by customer, then by visit date within each customer.
test0624car <- test0624car[with(test0624car, order(CUST_ID, ORDERDATE)), ]
#head(test0624car[,c(1:4)])


# Visits per customer (printed for inspection); keep customers with >= 3 visits.
visit_counts <- table(test0624car$CUST_ID)
visit_counts
innames <- names(which(visit_counts >= 3))

test0624car3 <- test0624car[test0624car$CUST_ID %in% innames, ]
tablenum <- table(test0624car3$CUST_ID)
length(tablenum)
# cumsum(tablenum) is used as the row position of each customer's last visit.
# NOTE(review): this relies on table() listing customers in the same order as
# the sort above -- confirm for the actual CUST_ID type.
last_visit_rows <- cumsum(tablenum)
newtest0624car <- test0624car3[-last_visit_rows, ]
dim(test0624car3)
dim(newtest0624car)

# Per-customer gaps between consecutive visits, flattened into one vector and
# attached to the rows that remain after each customer's last visit was dropped.
test0624diff <- with(test0624car3, tapply(ORDERDATE, CUST_ID, diff))
dfg <- unlist(test0624diff)
dftest <- cbind(newtest0624car, dfg)



# Preview columns 1-3 and 13.
# NOTE(review): 13 is presumably the appended `dfg` column -- confirm the CSV
# really has 12 columns.
head(dftest[, c(1, 2, 3, 13)], 10)
write.csv(dftest, "dftest.csv")

##### Time series #####

# Customers with at least 30 visits are the only ones modelled with ARIMA.
innames2 <- names(which(table(test0624car$CUST_ID) >= 30))

# Preallocate one slot per customer instead of growing the vectors in the loop.
Pre <- numeric(length(innames2))
real <- numeric(length(innames2))

# seq_along() yields an empty sequence when no customer qualifies, whereas
# 1:length(innames2) would iterate over c(1, 0) and fail inside auto.arima().
for (item in seq_along(innames2)) {
  tempdf <- dftest$dfg[which(dftest$CUST_ID == innames2[item])]
  n_obs <- length(tempdf)

  # Fit on every interval except the last one, which is held out as the truth.
  temparima <- auto.arima(tempdf[-n_obs])

  # One-step-ahead forecast of the held-out interval.
  preout <- forecast(temparima, h = 1)
  real[item] <- tempdf[n_obs]
  Pre[item] <- as.numeric(preout$mean)
}

# Forecast error per customer: actual interval minus predicted interval.
errorPre <- real - Pre
write.csv(Pre, "Pre.csv")

### Accuracy ###

# A forecast counts as correct when its absolute error is below 10.
# Vectorised count: the previous 1:length(errorPre) loop failed with
# "missing value where TRUE/FALSE needed" when errorPre was empty.
# NOTE: an NA in errorPre now propagates to the result instead of raising.
b1 <- length(errorPre)
a1 <- sum(abs(errorPre) < 10)

# Report NA rather than dividing 0 by 0 when there is nothing to score.
(accer <- if (b1 > 0) a1 / b1 else NA_real_)

write.csv(errorPre, "errorPre.csv")
write.csv(accer, "accer.csv")

# Positions of the customers whose absolute forecast error is below 2.
close_hits <- abs(errorPre) < 2
which(close_hits)

# Diagnostics for one customer: the 4th entry of innames2.
picked_customer <- innames2[4]
tempdf <- dftest$dfg[which(dftest$CUST_ID == picked_customer)]
plot(tempdf, type = 'o', main = "时序图")

# Refit with the last interval left out, then forecast with the default horizon.
training_part <- head(tempdf, -1)
temparima <- auto.arima(training_part)
pretest <- forecast(temparima)

# Plot the autocorrelation of the residuals and save them to CSV.
acf(pretest$residuals)
write.csv(pretest$residuals, "pretest_residuals.csv")

你可能感兴趣的:(python,R语言,数据挖掘,python,r)