#encoding=utf-8
import requests
import re
from bs4 import BeautifulSoup
import json
import xlwt
import xlrd
# Accumulates one dict per scraped Taobao listing, across all pages fetched below.
DATA= []
# Product name to search for.  Prompt text (Chinese): "please enter the
# product name you want to price-compare (e.g. jelly bag)".
goods = raw_input('请输入您想要进行比价的商品名称(如:果冻包)\n>>>')
# --- Page 1: fetch the first batch of search results ---------------------
# The search result page embeds its listing data as a JavaScript object
# assigned to `g_page_config`; extract that JSON blob and parse it.
url = 'https://s.taobao.com/search?q={}&bcoffset=6&ntoffset=6&p4ppushleft=1%2C48&s=0'.format(goods)
r = requests.get(url, timeout = 30)
html = r.text
# Everything between "g_page_config = " and "g_srp_loadCss" is the JSON
# object plus a trailing ";", which strip()[:-1] removes.
# BUG FIX: previously `[0]` raised an opaque IndexError when Taobao served
# an anti-bot / login page without the marker; fail with a clear message.
matches = re.findall(r'g_page_config = (.+?)g_srp_loadCss', html, re.S)
if not matches:
    raise ValueError('g_page_config not found in search page - possibly blocked by Taobao')
content = json.loads(matches[0].strip()[:-1])
# List of listing ("auction") dicts for this page.
dataList = content['mods']['itemlist']['data']['auctions']
for item in dataList:
    DATA.append({
        'raw_title': item['raw_title'],    # listing title
        'view_price': item['view_price'],  # displayed price
        'view_sales': item['view_sales'],  # sales-count text
        # view_fee is the shipping fee: 0.0 means free shipping ("是" = yes).
        'view_fee': '否' if float(item['view_fee']) else '是',
        'isTmall': '是' if item['shopcard']['isTmall'] else '否',
        'view_loc': item['item_loc'],      # seller location
        'name': item['nick'],              # shop name
        'detail_url': item['detail_url']
    })
# print len(DATA)
#cookie保持
cookies = r.cookies
#获取剩余的12条数据
url2 = 'https://s.taobao.com/api?_ksTS=1531540228441_814&callback=jsonp815&ajax=true&m=customized&q={}&ntoffset=9&p4ppushleft=1,48&s=36&bcoffset=-1&rn=fb9f089092fbceed410248ff5e71d997'.format(goods)
r2 = requests.get(url2, cookies=cookies)
#获取html页面
html2 = r2.text
#获取json数据
content = re.findall(r'{.+}', html2)[0]
#格式化json数据成为字典
content = json.loads(content)
#信息列表
dataList = content['API.CustomizedApi']['itemlist']['auctions']
#提取信息
for item in dataList:
temp = {
'raw_title': item['raw_title'],
'view_price': item['view_price'],
'view_sales': item['view_sales'],
'view_fee': '否' if float(item['view_fee']) else '是',
'isTmall': '是' if item['shopcard']['isTmall'] else '否',
'view_loc': item['item_loc'],
'name': item['nick'],
'detail_url': item['detail_url']
}
DATA.append(temp)
print len(DATA)
#cookie保持
cookies = r2.cookies
for i in range(1,2):
url = 'https://s.taobao.com/search?q={}&bcoffset=6&ntoffset=6&p4ppushleft=1%2C48&s={}'.format(goods, i*44)
r3 = requests.get(url, cookies=cookies)
html3 = r3.text
# 获取json数据
content = re.findall(r'g_page_config = (.+?)g_srp_loadCss', html3, re.S)[0].strip()[:-1]
# 格式化json数据
content = json.loads(content)
# 信息列表
dataList = content['mods']['itemlist']['data']['auctions']
# 提取数据
for item in dataList:
temp = {
'raw_title': item['raw_title'],
'view_price': item['view_price'],
'view_sales': item['view_sales'],
'view_fee': '否' if float(item['view_fee']) else '是',
'isTmall': '是' if item['shopcard']['isTmall'] else '否',
'view_loc': item['item_loc'],
'name': item['nick'],
'detail_url': item['detail_url']
}
DATA.append(temp)
print len(DATA)
# 因为在后续的翻页过程中不存在异步加载的数据信息,所以说直接使用url1获取第一次加载获取得到的html页面,并得到其中的数据内容即可,
# 否则可以使用cookie保持,然后借助于2次到3次加载剩余数据时采用的url的不同,以及之间的联系信息进行url的处理并获取其中的数据信息
# 获取剩余的12条数据
# cookies = r3.cookies
# url2 = 'https://s.taobao.com/api?_ksTS=1531490330846_224&callback=jsonp225&ajax=true&m=customized&sourceId=tb.index&q=python&spm=a21bo.2017.201856-taobao-item.1&s=36&imgfile=&initiative_id=tbindexz_20170306&bcoffset=-1&commend=all&ie=utf8&rn=cfa45b12557fdf04fda5b2f0bff49239&ssid=s5-e&search_type=item'
# r2 = requests.get(url2, cookies=cookies)
# # 获取html页面
# html2 = r2.text
# # print html2
#
# # 获取json数据
# content = re.findall(r'{.+}', html2)[0]
#
# # 格式化json数据成为字典
# content = json.loads(content)
#
# # 信息列表
# dataList = content['API.CustomizedApi']['itemlist']['auctions']
#
# # 提取信息
# for item in dataList:
# temp = {
# 'raw_title': item['raw_title'],
# 'view_price': item['view_price'],
# 'view_sales': item['view_sales'],
# 'view_fee': '否' if float(item['view_fee']) else '是',
# 'isTmall': '是' if item['shopcard']['isTmall'] else '否',
# 'view_loc': item['item_loc'],
# 'name': item['nick'],
# 'detail_url': item['detail_url']
# }
# DATA.append(temp)
print len(DATA)
# --- Persist the scraped listings to an .xls spreadsheet -----------------
# (Plotting was planned here but never implemented.)
f = xlwt.Workbook(encoding = 'utf-8')
# Cell style using the SimSun font so Chinese text renders consistently.
style = xlwt.XFStyle()
font = xlwt.Font()
font.name = 'SimSun'
style.font = font
worksheet = f.add_sheet('my_firt_xlwt', cell_overwrite_ok=False)
# Header row: title, price, buyer count, free shipping, Tmall, region, shop, url.
# BUG FIX: `style` was previously built but never passed to write(), so
# the SimSun font setting had no effect; it is now applied to every cell.
headers = ['标题', '标价', '购买人数', '是否包邮', '是否天猫', '地区', '店名', 'url']
for col, caption in enumerate(headers):
    worksheet.write(0, col, caption, style)
# One row per scraped listing, columns in the same order as the headers.
columns = ['raw_title', 'view_price', 'view_sales', 'view_fee',
           'isTmall', 'view_loc', 'name', 'detail_url']
for row, record in enumerate(DATA):
    for col, key in enumerate(columns):
        worksheet.write(row + 1, col, record[key], style)
f.save(u'the result of search {}.xls'.format(goods))
# #从表格中读取数据
'''
文件路径比较重要,要以这种方式去写文件路径不用
'''
file_path = r'd:/python-workspace/python-pachong/taobao-price/the result of search {}.xls'.format(goods)
#读取的文件路径
file_path = file_path.decode('utf-8')
#文件中的中文转码
data = xlrd.open_workbook(file_path)
#获取数据
# table = data.sheet_by_name('my_firt_xlwt')
table = data.sheets()[0]
#获取sheet
nrows = table.nrows
#获取总行数
ncols = table.ncols
#获取总列数
#获取每一行的所有数据
for i in range(nrows):
rows_data = table.row_values(i)
# print rows_data
# print nrows
#获取一列的数值
for i in range(ncols):
cols_data = table.col_values(i)
# print cols_data
# print type(cols_data)
# print ncols
#获取一个单元格的数值
for row in range(1, nrows):
print "%d行的值为: " %row,
for col in range(ncols):
#table为sheet的名字
#用行进行读取某行某列的数据 table.row_values(row_num)[col_num]
# print type(table.row_values(row)[col].encode('utf-8'))
# cell_value = table.row_values(row)[col].value
#用列进行读取某行某列的数据 table.col_values(col_num)[row_num]
# cell_value = table.col_values(col)[row].value
#用单元格进行某行某列数据的读取操作 table.cell(row_num,col_num).value
cell_value = table.cell(row, col).value
if (len(cell_value) > 20):
cell_value = cell_value[:20]+'...'
print cell_value + '\t\t',
print ''