2018-07-16-北京新发地市场爬虫

coding=utf-8

import requests
from bs4 import BeautifulSoup
import re
import os
import pymongo
import json
import pandas as pd
import numpy as np
import xlrd
import datetime
from pyecharts import Line,Grid,EffectScatter,Overlap

def getPriceSoup_table():
soup_table=BeautifulSoup('',"lxml")
for i in range(500):
url = "http://www.xinfadi.com.cn/marketanalysis/4/list/"+str(i)+".shtml"
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0)' +
'Gecko/20100101 Firefox/61.0'}
r = requests.get(url,headers=headers)
html = r.text.encode(encoding='utf_8').decode()
soup = BeautifulSoup(html,"lxml")
table = soup.find('table', attrs={'class' :{'hq_table'}})
soup_table.append(table)
return soup_table

def soup_table2DataFrame(table): #构造dataframe 准备存储表格
#收集表头
columns = [x.text for x in table.tr.findAll('td')]
columns = [x.replace('\xa0','') for x in columns]
print(columns)
rows=[]
for row in table.findAll('tr'):
if row not in rows and row.find('td')!=None:
rows.append(row)
print(rows)
height = len(rows)-1
width = len(columns)
#逐行分析表格
df = pd.DataFrame(data = np.full((height,width),'',dtype = 'U'),columns = columns)
#逐行分析表格
for i in range(height):
cells = rows[i+1].findAll('td')
if len(cells) == width:
df.iloc[i] = [cell.text.replace(' ','').replace('\n','') for cell in cells] #去点空格和换行
else:
w=len(cells)
df.iloc[i,width-w:] = [cell.text.replace(' ','').replace('\n','') for cell in cells]
return df

def getPriceLine(product_name,specification,days):
price_data = []
price_date = []
for i in range(days):
spiderDay = datetime.date.today()- datetime.timedelta(days=i+1)
spiderDayStr =str(spiderDay) #2018-07-11格式
sendtimeStr=re.sub("-","",spiderDayStr) #20180711格式
#outputfilePath="D:/xlsx/"+sendtimeStr+".水产品价格xlsx"
if os.path.exists(outputfilePath):
ExcelFile = xlrd.open_workbook(outputfilePath)
sheet = ExcelFile.sheet_by_index(0)
columnIndex = None
rowIndex = None
for j in range(sheet.ncols):
for i in range(sheet.nrows):
if sheet.cell_value(i, j) == '平均价':
columnIndex = j
break
if sheet.cell_value(i, j) == product_name:
if sheet.cell_value(i, j+6) == spiderDayStr:
if sheet.cell_value(i, j+4) == specification:
rowIndex = i
break
print('-------------columnIndex----------------'+str(columnIndex))
print('-------------rowIndex----------------'+str(rowIndex))
if not (rowIndex == None) and not (columnIndex == None):
print(sheet.cell_value(rowIndex, columnIndex))
price_data.append(sheet.cell_value(rowIndex, columnIndex))
price_date.append(sendtimeStr)
print(price_data)
print(price_date)
attr = price_date[::-1]
v1 = price_data[::-1]
line = Line(product_name+str(days)+'天'+'价格走势图')
line.add(product_name,attr,v1,is_smooth = False,mark_point = ['max','min'],mark_line=["average"],yaxis_formatter="元/斤")
grid =Grid()
grid.add(line,grid_top="10%")

判断是否存在目标文件夹

isExists=os.path.exists('D:/价格走势图')
if not isExists:

如果不存在则创建目录

    os.makedirs('D:/价格走势图') 

    print ('创建D:/价格走势图文件夹成功')
#return True
else:
    # 如果目录存在则不创建,并提示目录已存在

print ('目录已存在')

    grid.render('D:/价格走势图/北京新发地市场'+product_name+str(days)+'天价格走势图.html')
    print('已得到'+str(days)+'天'+product_name+'价格走势图')

es = EffectScatter()

es.add('',attr,v1,effect_scale=8) #闪烁

overlop = Overlap()

overlop.add(line) #必须先添加line,在添加es

overlop.add(es)

overlop.render('./line-es01.html')

return  
    
#client= pymongo.MongoClient()
#获取一个数据库   
#db=client.priceSpider
#创建 或获取一个集合,并在collection下新建books
#account=db.prcie
#data=xlrd.open_workbook("D:/"+sendtimeStr+".xlsx")
#table=data.sheets()[0]
#读取excel第一行数据作为存入mongodb的字段名
#rowstag=table.row_values(0)
#nrows=table.nrows
#print('-------------nrows----------------'+str(nrows))
#ncols=table.ncols #print rows
#returnData={}
#for i in range(1,nrows):
    #将字段名和excel数据存储为字典形式,并转换为json格式
    #returnData[i]=json.dumps(dict(zip(rowstag,table.row_values(i))))
    #通过编解码还原数据
    #returnData[i]=json.loads(returnData[i])
    #print returnData[i]
    #account.insert(returnData[i])
    #return daySpider

if name=="main":
table = getPriceSoup_table()
print(table)
print('---------------------------------------------')
if not table == None:
df = soup_table2DataFrame(table)
isExists=os.path.exists('D:/xlsx')
if not isExists:
# 如果不存在则创建目录
os.makedirs('D:/xlsx')
print ('创建D:/xlsx文件夹成功')
#return True
else:
outputfilePath="D:/xlsx/北京新发地市场水产品每日价格行情.xlsx"
df.to_excel(outputfilePath)
print('-------------------------------')

outputfilePath="D:/xlsx/北京新发地市场水产品每日价格行情.xlsx"
getPriceLine('桂鱼','标桂',120)

你可能感兴趣的:(2018-07-16-北京新发地市场爬虫)