# coding=utf-8
import requests
from bs4 import BeautifulSoup
import re
import os
import pymongo
import json
import pandas as pd
import numpy as np
import xlrd
import datetime
from pyecharts import Line,Grid,EffectScatter,Overlap
def getPriceSoup_table(spiderDay):
    """Download the aquatic-product price tables published for one day.

    Requests result pages 1-3 of the price listing for ``spiderDay`` and
    collects every ``h_list_table r_list_table`` table found into a single
    BeautifulSoup document.

    Args:
        spiderDay: Day to query. It is converted with ``str()`` and sent as the
            ``sendtime`` query parameter.

    Returns:
        A ``BeautifulSoup`` object holding the collected tables, or ``None``
        when none of the pages contains a ``<td>`` cell.

    Raises:
        requests.exceptions.RequestException: If a page cannot be fetched
            (including when the request times out).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0)'
                      'Gecko/20100101 Firefox/61.0'
    }
    soup_table = BeautifulSoup('', "lxml")
    for page in range(1, 4):
        url = ("http://www.hfzgncp.com.cn/index.php?m=content&c=index&a=lists"
               "&catid=59&sendtime=" + str(spiderDay) + "&page=" + str(page))
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=headers, timeout=30)
        soup = BeautifulSoup(response.text, "lxml")
        table = soup.find('table', attrs={'class': 'h_list_table r_list_table'})
        # A page without the price table yields None, which cannot be appended.
        if table is None:
            continue
        soup_table.append(table)
    if soup_table.find('td') is None:
        print('!!!!!!!!!!!!!!!!!!!!!!' + str(spiderDay) + '这一天没有水产品价格数据!!!!!!!!!!!!!!!!!!!!!!')
        return None
    print(str(spiderDay) + '这一天有水产品价格数据')
    return soup_table
def soup_table2DataFrame(soup_table):  # build the DataFrame that stores the table
    """Convert the scraped price table(s) into a pandas DataFrame of strings.

    Column names come from the ``<th>`` cells of the first ``<tr>`` (with
    non-breaking spaces removed). Every distinct ``<tr>`` that contains at
    least one ``<td>`` becomes one row; spaces and newlines are stripped from
    each cell's text.

    Rows with fewer cells than there are columns are right-aligned and the
    leading columns are left as empty strings. Rows with more cells than
    columns keep only their first ``len(columns)`` cells.

    Args:
        soup_table: Parsed table markup exposing ``.tr``, ``find`` and
            ``find_all`` (a BeautifulSoup object or tag).

    Returns:
        The populated ``pandas.DataFrame``, or ``None`` when there is no
        header row, no header cell, or no data row.
    """
    header_row = soup_table.tr
    if header_row is None:
        return None

    # Collect the header cells.
    columns = [th.text.replace('\xa0', '') for th in header_row.find_all('th')]
    width = len(columns)
    print('水产品表头数目:' + str(width))
    if width == 0:
        return None

    # Keep each data row once; identical rows can appear on several pages.
    rows = []
    for row in soup_table.find_all('tr'):
        if row.find('td') is not None and row not in rows:
            rows.append(row)
    height = len(rows)  # number of product entries
    if height <= 0:
        return None
    print('水产品种类数目:' + str(height))

    records = []
    for row in rows:
        values = [cell.text.replace(' ', '').replace('\n', '')
                  for cell in row.find_all('td')]
        # Cells beyond the header width have no column to go into.
        values = values[:width]
        # Short rows are right-aligned; the missing leading cells stay blank.
        records.append([''] * (width - len(values)) + values)
    return pd.DataFrame(records, columns=columns)
def onedayPriceSpider(spiderDay):
    """Placeholder for a single-day price spider; currently does nothing.

    The function has no implementation. It has an explicit body so the module
    stays importable, and the earlier draft is kept below as notes.

    Args:
        spiderDay: Day to crawl (unused).

    Returns:
        None.
    """
    # Earlier draft, retained for reference:
    # count the data rows of the table
    #height = len(table.findAll(lambda tag:tag.name=='tr' and len(tag.findAll('td'))>=1))
    #rows = [row for row in table.findAll('tr') if row.find('td')!=None]
    #print('rows:'+str(len(rows)))
    # read the date the data was published for
    #sendtime = soup.find('input', attrs={'id' :{'sendtime'}})['value'].rstrip('/-')
    #sendtimeStr=re.sub("\-","",sendtime)
    return None
def _find_price_cell(sheet, product_name):
    """Locate the average-price cell of ``product_name`` in ``sheet``.

    Scans column by column. In each column the scan stops at the first cell
    that equals the '平均价' header (recording the column) or ``product_name``
    (recording the row).

    Returns:
        ``(row_index, column_index)``; either element is ``None`` if the
        corresponding cell was not found.
    """
    row_index = None
    column_index = None
    for col in range(sheet.ncols):
        for row in range(sheet.nrows):
            value = sheet.cell_value(row, col)
            if value == '平均价':
                column_index = col
                break
            if value == product_name:
                row_index = row
                break
    return row_index, column_index


def getPriceLine(product_name, days):
    """Render price-trend charts for one product from the saved workbooks.

    For each of the ``days`` days before today, opens
    ``D:/xlsx/<YYYYMMDD>水产品价格.xlsx`` if it exists and reads the product's
    '平均价' cell from the first sheet. The collected prices are plotted oldest
    first and written to
    ``D:/价格走势图/<product_name><days>天价格走势图.html``; a second chart with an
    effect-scatter overlay is written to ``./line-es01.html``.

    Args:
        product_name: Product label to look up in each workbook and to use in
            the chart titles and file name.
        days: Number of past days (starting from yesterday) to include.

    Returns:
        None.
    """
    price_data = []
    price_date = []
    for offset in range(days):
        spiderDay = datetime.date.today() - datetime.timedelta(days=offset + 1)
        sendtimeStr = str(spiderDay).replace("-", "")  # 2018-07-11 -> 20180711
        # Build the path per day, with the same naming scheme the export step
        # uses when it writes the workbooks.
        outputfilePath = "D:/xlsx/" + sendtimeStr + "水产品价格.xlsx"
        if not os.path.exists(outputfilePath):
            continue
        # NOTE(review): xlrd 2.0 and later only read .xls files; opening these
        # .xlsx workbooks needs xlrd < 2.0 - confirm the installed version.
        sheet = xlrd.open_workbook(outputfilePath).sheet_by_index(0)
        rowIndex, columnIndex = _find_price_cell(sheet, product_name)
        if rowIndex is not None and columnIndex is not None:
            price = sheet.cell_value(rowIndex, columnIndex)
            print(price)
            price_data.append(price)
            price_date.append(sendtimeStr)
    print(price_data)
    print(price_date)

    # Days were collected newest first; reverse so the x axis runs forward.
    attr = price_date[::-1]
    v1 = price_data[::-1]
    line = Line(product_name + '价格走势图')
    line.add(product_name, attr, v1, is_smooth=True, mark_point=['max', 'min'],
             mark_line=["average"], yaxis_formatter="元")
    grid = Grid()
    grid.add(line, grid_top="10%")

    # Make sure the output directory exists before rendering into it.
    if not os.path.exists('D:/价格走势图'):
        os.makedirs('D:/价格走势图')
        print('创建D:/价格走势图文件夹成功')
    else:
        print('目录已存在')
    grid.render('D:/价格走势图/' + product_name + str(days) + '天价格走势图.html')
    print('已得到' + product_name + '价格走势图')

    es = EffectScatter()
    es.add('', attr, v1, effect_scale=8)  # blinking markers
    overlap = Overlap()
    overlap.add(line)  # the line must be added before the effect scatter
    overlap.add(es)
    overlap.render('./line-es01.html')
    return None
#client= pymongo.MongoClient()
#获取一个数据库
#db=client.priceSpider
#创建 或获取一个集合,并在collection下新建books
#account=db.prcie
#data=xlrd.open_workbook("D:/"+sendtimeStr+".xlsx")
#table=data.sheets()[0]
#读取excel第一行数据作为存入mongodb的字段名
#rowstag=table.row_values(0)
#nrows=table.nrows
#print('-------------nrows----------------'+str(nrows))
#ncols=table.ncols #print rows
#returnData={}
#for i in range(1,nrows):
#将字段名和excel数据存储为字典形式,并转换为json格式
#returnData[i]=json.dumps(dict(zip(rowstag,table.row_values(i))))
#通过编解码还原数据
#returnData[i]=json.loads(returnData[i])
#print returnData[i]
#account.insert(returnData[i])
#return daySpider
# Script entry point: crawl the last 60 days of price tables, save each day
# that has data as an .xlsx workbook under D:/xlsx, then chart one product.
if __name__ == "__main__":
    spiderDaynumber = 60
    for i in range(spiderDaynumber):
        spiderDay = datetime.date.today() - datetime.timedelta(days=i + 1)
        spiderDayStr = str(spiderDay)  # e.g. 2018-07-11
        sendtimeStr = spiderDayStr.replace("-", "")  # e.g. 20180711
        soup_table = getPriceSoup_table(spiderDay)
        if soup_table is not None:
            df = soup_table2DataFrame(soup_table)
            # The conversion returns None when the table has no usable rows.
            if df is not None:
                # Create the directory first, then always write the workbook;
                # the day that triggers the creation must be saved as well.
                if not os.path.exists('D:/xlsx'):
                    os.makedirs('D:/xlsx')
                    print('创建D:/xlsx文件夹成功')
                outputfilePath = "D:/xlsx/" + sendtimeStr + "水产品价格.xlsx"
                df.to_excel(outputfilePath)
        print('-------------------------------')
    getPriceLine('带鱼(大)', 59)