import os
import re
import time
import random
import logging
import pathlib
import requests
from lxml import etree
from pymongo import MongoClient
# Manually change the ID of the shop to crawl as needed, and set the cookie.
# shopId is used below as a key into shop_list.
shopId = 6
# Raw Cookie header value sent with every request.
# NOTE(review): this looks like a logged-in browser session cookie committed to
# source; presumably it expires and has to be refreshed by hand - confirm, and
# consider loading it from outside the repository.
cookie= 'cna=84DqFV4SPyYCATo9kTLfT6u+; t=aa42477f58c7f2322f00dfb5a1eb3ecc; _tb_token_=7e51fd1e5e1e7; cookie2=156089853e8f3a6eeb0f7920d1963fc3; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; x=__ll%3D-1%26_ato%3D0; tk_trace=1; dnk=pengjun%5Cu674E; uc1=cookie14=UoTaEcMIdvG%2F2Q%3D%3D&lng=zh_CN&cookie16=VT5L2FSpNgq6fDudInPRgavC%2BQ%3D%3D&existShop=false&cookie21=URm48syIYn73&tag=8&cookie15=W5iHLLyFOGW7aA%3D%3D&pas=0; uc3=vt3=F8dByuK6XCe%2FyobG9RM%3D&nk2=E6EQ1CLKS%2FnL&id2=VWeT3jqq6jDz&lg2=W5iHLLyFOGW7aA%3D%3D; tracknick=pengjun%5Cu674E; lid=pengjun%E6%9D%8E; uc4=id4=0%40V8Zo2exYFQXrRTZfa2A8fWgCJ%2B0%3D&nk4=0%40EbhmhLlrKdq9uf0H4heNaPV%2BwIo%3D; lgc=pengjun%5Cu674E; csg=b9af8b15; enc=HPXzwVBtnTh2ZKD7IdgorhLo07qNH2rA9jqbXScJDYdMLIFeET66f7y07GgZfiMfpKBC%2BItvWd2MLhSwCstmeA%3D%3D; whl=-1%260%260%260; cq=ccp%3D1; swfstore=171740; _bl_uid=Oekw81wkrjto8X9ddpwz59Lo2byd; pnm_cku822=; _m_h5_tk=b6a19e8985356b467ddaa2fba0d073e9_1571740652530; _m_h5_tk_enc=715e1141f3a7515e95f7ff83e824eca0; isg=BH19CXaG_C6KPlgX7xg1CrZpjNnbCuFDleKNlz_CuVQDdp2oB2rBPEsgIOqVdskk; l=dBjfAG8qqFtnSVAFBOCwourza77OSIRAguPzaNbMi_5B16L1Dn7OkZk0OFp6VjWftt8B4-YhSFe9-etkid-Jth7djawTBxDc.'
# Configure the log output format and create this module's logger.
# The time of day is spelled out as %H:%M:%S rather than the shorthand %T:
# %T is not among the strftime directives Python documents as supported, so
# it depends on the platform C library, while %H:%M:%S works everywhere.
LOG_DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)-15s] [%(levelname)8s] [%(name)10s ] - %(message)s (%(filename)s:%(lineno)s)',
    datefmt=LOG_DATE_FORMAT,
)
logger = logging.getLogger(__name__)
# Configuration of the shops that can be crawled, keyed by shop ID.
# Every shop uses the same URL layout; only the sub-domain, the JSONP callback
# name and the widget ID differ, so the entries are generated from a table.
# The "{}" left in list_url is filled with the page number at request time.
_LIST_URL_TEMPLATE = (
    "https://{shop}.tmall.com/i/asynSearch.htm?callback={callback}"
    "&mid=w-{wid}-0&wid={wid}&path=/search.htm&search=y&pageNo={{}}"
)
_REFERER_TEMPLATE = "https://{shop}.tmall.com/search.htm"
# (shop ID, shop sub-domain / name, JSONP callback, widget ID)
_SHOP_PARAMS = (
    (1, "purcotton", "jsonp693", "14440378953"),
    (2, "miansen", "jsonp363", "16800593356"),
    (3, "zichu", "jsonp363", "14977327192"),
    (4, "babycaremy", "jsonp125", "14913709402"),
    (5, "jianrou", "jsonp125", "16603479881"),
    (6, "qingshenghuorh", "jsonp116", "14896201470"),
)
shop_list = {
    shop_id: {
        "shop_name": shop,
        "list_url": _LIST_URL_TEMPLATE.format(shop=shop, callback=callback, wid=wid),
        "referer": _REFERER_TEMPLATE.format(shop=shop),
    }
    for shop_id, shop, callback, wid in _SHOP_PARAMS
}
# Build the HTTP headers sent with each request: a desktop Chrome user agent,
# the selected shop's search page as Referer, and the configured cookie.
# NOTE(review): 'br' is advertised in accept-encoding; presumably the HTTP
# stack in use can decode Brotli responses - confirm.
_selected_shop = shop_list[shopId]
headers = dict([
    ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'),
    ('Referer', _selected_shop['referer']),
    ('Cookie', cookie),
    ('accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3'),
    ('accept-encoding', 'gzip, deflate, br'),
    ('accept-language', 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7'),
])
# Directory that holds the selected shop's crawl data, one folder per day:
# D:\scrapy\double11\<YYYY-MM-DD>\<shop_name>
now = time.time()
date_str = time.strftime('%Y-%m-%d', time.localtime(now))
# Raw string: "\s", "\d" and "\{" are not valid escape sequences. They used to
# be kept literally, but Python now warns about them and plans to reject them.
shop_dir = r"D:\scrapy\double11\{}\{}".format(date_str, shop_list[shopId]['shop_name'])
# exist_ok avoids the check-then-create race of testing os.path.exists first.
os.makedirs(shop_dir, exist_ok=True)
# Regex matched against the response body: captures the text between the
# quotes of the JSONP wrapper  jsonp<digits>("...").
# DOTALL lets the captured text span several lines.
search_pattern = re.compile(
    r'jsonp\d*\(\"(.*?)\"\)',
    flags=re.DOTALL,
)
# Connect to MongoDB and select the collection that stores the goods.
m = MongoClient(host="172.16.250.238", port=27017)
test_db = m["test"]
db = test_db['tmallGoodsEntity']

# First flag every stored item of this shop as disabled, so that items which
# have been taken off the shelves drop out; the crawl loop below sets
# enabled=True again on each item it writes.
_shop_filter = {"shopId": shopId}
_disable_all = {'$set': {"enabled": False}}
db.update_many(_shop_filter, _disable_all)

# Collects the IDs of the items that were crawled.
goods_list = list()

# Paging state. At most 15 network requests are sent per run so that the
# crawler does not trip the site's anti-crawling protection.
currentPage, hasNext = 1, True
requests_count = 15

# Character set used to decode responses and to read/write the cache files.
charSet = "gbk"

# HTTP session shared by all requests.
s = requests.Session()
# Crawl the shop's listing pages one by one until no "next page" link is found
# or the network-request budget (requests_count) is used up, then print the
# collected item IDs.
# NOTE(review): the indentation of this block has been lost in this copy of the
# file; the nesting has to be restored before the code can run.
while hasNext and requests_count>0:
logger.info("开始爬取第 {} 页".format(currentPage))
# NOTE(review): "\p" is not a recognised escape sequence, so the backslash is
# kept literally here; a raw string or "\\page_..." would say so explicitly.
pageFilePath = shop_dir + "\page_{}.txt".format(currentPage)
pagePath = pathlib.Path(pageFilePath)
# Reuse the page cached on disk when it exists; otherwise fetch it.
if pagePath.exists():
logger.info("使用本地文件。。。")
with open(pageFilePath, 'r', encoding=charSet) as f:
html_str = f.read()
else:
logger.info("发送网络请求。。。")
url = shop_list[shopId]['list_url'].format(currentPage)
# Wait 1-4 seconds before each request.
time.sleep(1+random.randint(0,3))
try :
# The budget is decremented before the request is sent.
requests_count = requests_count-1
response = s.get(url, headers=headers)
# BaseException also covers KeyboardInterrupt/SystemExit; whatever is caught
# is logged at INFO level and the crawl stops.
except BaseException as e:
logger.info(e)
break
else:
ret_str = response.content.decode(charSet,"ignore")
# Remove the parts of the response that disturb XML parsing: the escaped
# quotes (backslash + double quote) around attribute values.
format_str = ret_str.replace('=\\\"', '=')
format_str = format_str.replace('\\\" ', ' ')
format_str = format_str.replace('\\\">', '>')
search_ret = search_pattern.search(format_str)
if search_ret:
# Keep the captured payload and cache it on disk for later runs.
html_str = search_ret.group(1)
with open(pageFilePath, 'w', encoding=charSet) as f:
f.write(html_str)
# NOTE(review): html_str is only assigned when a cache file was read or the
# regex matched; otherwise it looks unbound (first page) or left over from the
# previous page - confirm once the indentation is restored.
if html_str:
# Build the element tree.
html = etree.HTML(html_str)
# Pretty-print the HTML content (debugging aid, disabled).
# print(etree.tostring(html, pretty_print=True).decode('utf-8'))
# Check whether there is a next page.
next_page = html.xpath('//p[@class="ui-page-s"]/a[@title="下一页"]')
if len(next_page) == 0:
hasNext = False
# Parse the page content.
# NOTE(review): the next line is truncated in this copy of the file - the text
# between "position()" and "0 else 0" is missing. The loop over `items` and the
# assignments of sku_id, img_url, item_name, sku_price_str and sale_count_str
# that the code below relies on are not visible; restore them from the
# original source.
items = html.xpath('//div[@class="J_TItems"]/div[(@class="item5line1" or @class="item4line1") and position()0 else 0
rate_list = item.xpath('./dd[@class="rates"]/div/h4/a/span/text()')
# Strip the "评价: " label from the first match; fall back to 0 when none.
rate_count_str = rate_list[0].strip().replace("评价: ","") if len(rate_list)>0 else 0
# Insert the item when its _id is not stored yet, otherwise update it.
ret = db.find_one({'_id': sku_id})
if ret is None:
o = {}
o['_id'] = sku_id
o['imgUrl'] = img_url
o['title'] = item_name
o['price'] = float(sku_price_str)
o['totalSaleCount'] = int(sale_count_str)
o['rateCount'] = int(rate_count_str)
o['enabled'] = True
o['updateTime'] = int(now)
# ID of the shop the item belongs to.
o['shopId'] = shopId
db.insert_one(o)
else:
# Existing item: refresh the crawled fields and re-enable it.
o = {'$set': {}}
o['$set']['imgUrl'] = img_url
o['$set']['title'] = item_name
o['$set']['price'] = float(sku_price_str)
o['$set']['totalSaleCount'] = int(sale_count_str)
o['$set']['rateCount'] = int(rate_count_str)
o['$set']['enabled'] = True
o['$set']['updateTime'] = int(now)
db.update_one({'_id': sku_id}, o)
goods_list.append(sku_id)
currentPage=currentPage+1
print(goods_list)
# 数据示例,需关注字段类型
"""
{
"_id": 554949386593,
"imgUrl": "//img.alicdn.com/bao/uploaded/i3/430490406/O1CN01MfS3gC1ErzMAMswd5_!!0-item_pic.jpg_180x180.jpg",
"title": "全棉时代擦脸巾洗脸巾女一次性洁面巾棉柔纯棉抽取式实惠盒装纸巾",
"price": 89.9,
"totalSaleCount": 359972,
"rateCount": 68780,
"enabled": true,
"updateTime": 1571987992,
"shopId": 1
}
"""
import os
import re
import json
import time
import random
import logging
import pathlib
import requests
from pymongo import MongoClient
# Manually change the ID of the shop to crawl as needed, and set the cookie.
# shopId is used below as a key into shop_list and as the MongoDB query value.
shopId = 6
# Raw Cookie header value sent with every request.
# NOTE(review): this looks like a logged-in browser session cookie committed to
# source; presumably it expires and has to be refreshed by hand - confirm, and
# consider loading it from outside the repository.
cookie= 'cna=84DqFV4SPyYCATo9kTLfT6u+; t=aa42477f58c7f2322f00dfb5a1eb3ecc; _tb_token_=7e51fd1e5e1e7; cookie2=156089853e8f3a6eeb0f7920d1963fc3; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; x=__ll%3D-1%26_ato%3D0; tk_trace=1; dnk=pengjun%5Cu674E; uc1=cookie14=UoTaEcMIdvG%2F2Q%3D%3D&lng=zh_CN&cookie16=VT5L2FSpNgq6fDudInPRgavC%2BQ%3D%3D&existShop=false&cookie21=URm48syIYn73&tag=8&cookie15=W5iHLLyFOGW7aA%3D%3D&pas=0; uc3=vt3=F8dByuK6XCe%2FyobG9RM%3D&nk2=E6EQ1CLKS%2FnL&id2=VWeT3jqq6jDz&lg2=W5iHLLyFOGW7aA%3D%3D; tracknick=pengjun%5Cu674E; lid=pengjun%E6%9D%8E; uc4=id4=0%40V8Zo2exYFQXrRTZfa2A8fWgCJ%2B0%3D&nk4=0%40EbhmhLlrKdq9uf0H4heNaPV%2BwIo%3D; lgc=pengjun%5Cu674E; csg=b9af8b15; enc=HPXzwVBtnTh2ZKD7IdgorhLo07qNH2rA9jqbXScJDYdMLIFeET66f7y07GgZfiMfpKBC%2BItvWd2MLhSwCstmeA%3D%3D; whl=-1%260%260%260; cq=ccp%3D1; swfstore=171740; _bl_uid=Oekw81wkrjto8X9ddpwz59Lo2byd; pnm_cku822=; _m_h5_tk=b6a19e8985356b467ddaa2fba0d073e9_1571740652530; _m_h5_tk_enc=715e1141f3a7515e95f7ff83e824eca0; l=dBjfAG8qqFtnSY8MBOCNqQKXiCQOSIRAguSJGwSBi_5aX6L6_x7OkZlPoFp6VjWftt8B4-YhSFe9-etkid-Jth7djawTBxDc.; isg=BAwM2l0Hve3W56lEVtfUqc-u3WqSVeDAnEVcZGbNGLda8az7jlWAfwJHkbnsuehH'
# Configure the log output format and create this module's logger.
# The time of day is spelled out as %H:%M:%S rather than the shorthand %T:
# %T is not among the strftime directives Python documents as supported, so
# it depends on the platform C library, while %H:%M:%S works everywhere.
LOG_DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)-15s] [%(levelname)8s] [%(name)10s ] - %(message)s (%(filename)s:%(lineno)s)',
    datefmt=LOG_DATE_FORMAT,
)
logger = logging.getLogger(__name__)
# Configuration of the shops that can be crawled, keyed by shop ID (1-based,
# in the order listed here).
# NOTE(review): shop 4 is "babycare" here, while the list-crawler configuration
# earlier in this file uses "babycaremy"; the name only feeds the data
# directory path in this script - confirm whether the difference is intended.
_SHOP_NAMES = (
    "purcotton",
    "miansen",
    "zichu",
    "babycare",
    "jianrou",
    "qingshenghuorh",
)
shop_list = {
    shop_id: {"shop_name": shop_name}
    for shop_id, shop_name in enumerate(_SHOP_NAMES, start=1)
}
# Build the base HTTP headers sent with each request: a desktop Chrome user
# agent plus the configured cookie. The per-item Referer is added to this dict
# later, right before each request.
# NOTE(review): 'br' is advertised in accept-encoding; presumably the HTTP
# stack in use can decode Brotli responses - confirm.
headers = dict([
    ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'),
    ('Cookie', cookie),
    ('accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3'),
    ('accept-encoding', 'gzip, deflate, br'),
    ('accept-language', 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7'),
])
# Directory that holds the selected shop's crawl data, one folder per day:
# D:\scrapy\double11\<YYYY-MM-DD>\<shop_name>
now = time.time()
date_str = time.strftime('%Y-%m-%d', time.localtime(now))
# Raw string: "\s", "\d" and "\{" are not valid escape sequences. They used to
# be kept literally, but Python now warns about them and plans to reject them.
shop_dir = r"D:\scrapy\double11\{}\{}".format(date_str, shop_list[shopId]['shop_name'])
# exist_ok avoids the check-then-create race of testing os.path.exists first.
os.makedirs(shop_dir, exist_ok=True)
# Connect to MongoDB and select the collection that stores the goods.
m = MongoClient(host="172.16.250.238", port=27017)
test_db = m["test"]
db = test_db['tmallGoodsEntity']

# Fetch the selected shop's items (a cursor over the matching documents).
_shop_filter = {"shopId": shopId}
goods_list = db.find(_shop_filter)

# Character set used to decode responses and to read/write the cache files.
charSet = "gbk"

# HTTP session shared by all requests.
s = requests.Session()
# Regex matched against the response body: captures the argument of the JSONP
# wrapper  setMdskip (...)  up to the closing parenthesis at the end of the
# text. DOTALL lets the payload span several lines.
search_pattern = re.compile(r'setMdskip\s\((.*?)\)$', re.S)
# Item-detail endpoint. Placeholders, in order: item ID, timestamp.
# The final parameter must read "&timestamp=": it had been mangled into
# "×tamp=" ("&times" decoded as an HTML entity), which made the timestamp
# part of the callback value.
info_url = (
    "https://mdskip.taobao.com/core/initItemDetail.htm"
    "?isUseInventoryCenter=false&cartEnable=false&service3C=false"
    "&isApparel=false&isSecKill=false&tmallBuySupport=true&isAreaSell=false"
    "&tryBeforeBuy=false&offlineShop=false&itemId={}&showShopProm=false"
    "&isPurchaseMallPage=false&isRegionLevel=false&household=false"
    "&sellerPreview=false&queryMemberRight=true&addressLevel=2"
    "&isForbidBuyItem=false&callback=setMdskip&timestamp={}"
)
# Referer sent with each detail request; the placeholder is the item ID.
referer_base= "https://detail.tmall.com/item.htm?id={}"
# Crawl the detail data (monthly sales, stock, price / pre-sale info) of every
# item of the shop and store it on the item's MongoDB document.
for goods in goods_list:
    goods_id = goods["_id"]
    logger.info("开始爬取商品: {}".format(goods_id))
    itemFilePath = shop_dir + "\\info_{}.txt".format(goods_id)
    pagePath = pathlib.Path(itemFilePath)
    # Reset per item: without this a failed match would either raise NameError
    # (first item) or silently reuse the previous item's payload and write it
    # to the wrong document.
    json_str = None
    if pagePath.exists():
        # Reuse the response cached on disk by an earlier run.
        logger.info("使用本地文件。。。")
        with open(itemFilePath, 'r', encoding=charSet) as f:
            json_str = f.read()
    else:
        logger.info("发送网络请求。。。")
        # Wait 1-4 seconds before each request.
        time.sleep(1 + random.randint(0, 3))
        headers['Referer'] = referer_base.format(goods_id)
        now = time.time()
        # The endpoint takes the timestamp in milliseconds.
        url = info_url.format(goods_id, int(round(now * 1000)))
        try:
            # A timeout keeps a stalled connection from hanging the crawl.
            response = s.get(url, headers=headers, timeout=30)
        except requests.RequestException as e:
            # Only request failures end the crawl; KeyboardInterrupt and
            # programming errors are no longer swallowed.
            logger.error(e)
            break
        ret_str = response.content.decode(charSet, "ignore")
        search_ret = search_pattern.search(ret_str)
        if search_ret:
            # Keep the JSON payload and cache it on disk for later runs.
            json_str = search_ret.group(1)
            with open(itemFilePath, 'w', encoding=charSet) as f:
                f.write(json_str)

    if not json_str:
        # NOTE(review): the original indentation was lost; the trailing
        # "else: break" is assumed to belong to "if json_str:" (stop crawling
        # once no payload can be obtained) - confirm.
        break

    # Convert the payload into a JSON object and pick out the needed parts.
    json_obj = json.loads(json_str)
    default_model = json_obj['defaultModel']
    # '-' marks items whose response carries no monthly sell count.
    monthSaleCount = default_model['sellCountDO'].get('sellCount', '-')
    totalQuantity = default_model['inventoryDO']['totalQuantity']
    price_info = default_model['itemPriceResultDO']['priceInfo']

    # Prefer the 'def' price entry; otherwise use the last entry, as before.
    # Starting from None keeps an empty price_info from raising NameError or
    # reusing the previous item's entry.
    def_info = None
    if 'def' in price_info:
        def_info = price_info['def']
    else:
        for key in price_info:
            def_info = price_info[key]
    if not def_info:
        continue

    if 'wrtInfo' in def_info:
        # The entry carries pre-sale information.
        wrt_info = def_info['wrtInfo']
        original_price = def_info['promotionList'][0]['price']
        pre_cash = wrt_info['price']
        pre_count = wrt_info['groupUC']
        o = {'$set': {
            'monthSaleCount': monthSaleCount,
            'originalPrice': float(original_price),
            # The pre-sale price value is divided by 100 before storing.
            'preSaleCash': int(pre_cash) / 100,
            'preSaleCount': int(pre_count),
            'preSale': True,
            'preSaleTotal': goods["price"] * int(pre_count),
            'totalQuantity': int(totalQuantity),
            'updateTime': int(now),
        }}
    else:
        original_price = def_info['price']
        o = {'$set': {
            'monthSaleCount': monthSaleCount,
            'originalPrice': float(original_price),
            'preSale': False,
            'totalQuantity': int(totalQuantity),
            'updateTime': int(now),
        }}
    db.update_one({'_id': int(goods_id)}, o)
# 数据示例,需关注字段类型
"""
{
"_id": 20159694203,
"imgUrl": "//img.alicdn.com/bao/uploaded/i4/430490406/O1CN01BzX29B1ErzMxI791m_!!0-item_pic.jpg_180x180.jpg",
"title": "全棉时代 产妇一次性内裤女士纯棉孕妇产后月子待产用品旅行 25条",
"price": 96,
"totalSaleCount": 631868,
"rateCount": 40631,
"enabled": true,
"updateTime": 1571993200,
"shopId": 1,
"monthSaleCount": "2.5万+",
"originalPrice": 194,
"preSale": true,
"preSaleCash": 10,
"preSaleCount": 26135,
"preSaleTotal": 2508960,
"totalQuantity": 18423
}
"""