最近找工作时遇到一家公司给的面试题:写一个脚本,爬取一个店铺所有商品的淘宝价、累计评论数、交易成功数和宝贝收藏人气。
考虑到这只是一个简单的demo,对下载速度并没有要求,所以也没有使用IP池。
脚本执行时会把爬取到的数据处理成一个列表并打印出来,
把爬到的数据存入了csv
请求函数代码发出来吧。
因为只用了半天时间,以前也没爬过,所以代码有些潦草
# Open the shop's listing and collect the ID of every item on one page
def get_goodsId(page):
    """Request one page of the shop's async search listing and extract item IDs.

    Args:
        page: Value sent as the ``pageNo`` query parameter.

    Returns:
        The list produced by ``re.findall`` for the text captured between
        ``itemIds=`` and ``&source`` in the response body (empty when the
        pattern does not occur).
    """
    # Browser-facing page for the same listing:
    # https://lp930428.taobao.com/search.htm?spm=a1z10.1-c-s.w5002-21751133133.1.6757669drR9RHI&search=y
    search_url = "https://lp930428.taobao.com/i/asynSearch.htm"

    # NOTE(review): the Cookie below is a hard-coded captured session;
    # presumably it has to be refreshed once it expires -- confirm.
    request_headers = {
        'Host': "lp930428.taobao.com",
        'Cookie': "t=e3ced4867bbbb33ad8e43a6664f1b440; thw=cn; cookie2=1ae89e1b7eb4f38050c2a8484ca7f9e3; _tb_token_=fe3fdd7e7787e; enc=FCtEbwsd4d4qFaGEEU1%2Bh6zQqkT31zH7ZjTng%2ByjgFV1IKn98igFAnRQ80yP1MGcCWXDBuTJBB%2Fv2BN0HrnsFw%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; cna=cfrcFZLIz0UCAXxBtKpIjGyJ; v=0; unb=3342625761; uc3=vt3=F8dByucj%2BPu98CEsPjg%3D&lg2=VFC%2FuZ9ayeYq2g%3D%3D&id2=UNN5E%2BCNxCTmeQ%3D%3D&nk2=symzrp14SFvM8w%3D%3D; csg=e158c6a6; lgc=%5Cu5C0F%5Cu9F99cxcgao; cookie17=UNN5E%2BCNxCTmeQ%3D%3D; dnk=%5Cu5C0F%5Cu9F99cxcgao; skt=5acee91bdd655b99; existShop=MTU3MTk4NzgwNA%3D%3D; uc4=id4=0%40UgQxlJz5L5Tdls4BK%2BsdqbH4MLk8&nk4=0%40sVYTNakktI2J71HcZK%2B0xxDC%2BDng; tracknick=%5Cu5C0F%5Cu9F99cxcgao; _cc_=W5iHLLyFfA%3D%3D; tg=0; _l_g_=Ug%3D%3D; sg=o1f; _nk_=%5Cu5C0F%5Cu9F99cxcgao; cookie1=B0OsmmuZ3XRBlfT4RmnPR0sbhg2pIYLBZ6M6p3hMTnE%3D; uc1=cookie16=U%2BGCWk%2F74Mx5tgzv3dWpnhjPaQ%3D%3D&cookie21=UtASsssmeW6lpyd%2BB%2B3t&cookie15=Vq8l%2BKCLz3%2F65A%3D%3D&existShop=false&pas=0&cookie14=UoTbnKo8UDBHXA%3D%3D&tag=8&lng=zh_CN; mt=ci=0_1; l=dBgSXBauqGXlvNsDBOCwVuI8YdQOAIRAguPRwhApi_5B9_T6fvbOkZkD2E96cjWftzTB4DlUQKv9-etXiDt6Qtk8sxAJvxDc.; isg=BNLSgblL29-ZPicjbaEizx2QI550S9VMW8fzYZwrzgVwr3KphHIYjcTNG0s2304V",
        'accept': "text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01",
        'x-requested-with': "XMLHttpRequest",
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.2 Safari/605.1.15",
        'sec-fetch-mode': "cors",
        'sec-fetch-site': "same-origin",
        'referer': "https://lp930428.taobao.com/search.htm?spm=a1z10.3-c-s.w4002-21751133153.93.1d6d1c25zjRRJk&_ksTS=1571991134154_186&callback=jsonp187&mid=w-21751133153-0&wid=21751133153&path=%2Fsearch.htm&search=y&pageNo=2",
        'accept-language': "zh-CN,zh;q=0.9",
        'cache-control': "no-cache",
    }
    query = {
        "_ksTS": "1571966776769_186",
        "mid": "w-21751133153-0",
        "wid": "21751133153",
        "path": "/search.htm",
        "search": "y",
        "spm": "a1z10.1-c-s.w5002-21751133133.1.6757669drR9RHI",
        "orderType": "newOn_desc",
        "pageNo": page,
    }

    # Silence urllib3 warnings; the request below skips TLS certificate
    # verification (verify=False).
    requests.packages.urllib3.disable_warnings()
    reply = requests.get(search_url, headers=request_headers, params=query, verify=False)

    # Extract the item IDs embedded in the response text.
    return re.findall('itemIds=(.*?)&source', reply.text, re.S)
# Open an item's detail page using an ID gathered from the shop listing
def get_goodsDetails(i):
    """Fetch an item's detail page and extract its title and ``sign`` value(s).

    Args:
        i: Item ID, formatted into the ``id`` query parameter of the page URL.

    Returns:
        A ``(titles, signs)`` tuple of ``re.findall`` result lists:
        ``titles`` holds the ``data-title`` attribute values found after
        ``class="tb-main-title"`` (the item name), and ``signs`` holds the
        text captured between ``sign=`` and the next ``&`` (used to build
        the link for the item's favourite/popularity counter).
    """
    page_url = "https://item.taobao.com/item.htm?spm=a1z10.3-c-s.w4002-21751133153.37.66ab1c25jaTgVP&id={}".format(i)
    request_headers = {
        'Host': "item.taobao.com",
        'Cookie': "t=e3ced4867bbbb33ad8e43a6664f1b440; thw=cn; cookie2=1ae89e1b7eb4f38050c2a8484ca7f9e3; _tb_token_=fe3fdd7e7787e; enc=FCtEbwsd4d4qFaGEEU1%2Bh6zQqkT31zH7ZjTng%2ByjgFV1IKn98igFAnRQ80yP1MGcCWXDBuTJBB%2Fv2BN0HrnsFw%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; cna=cfrcFZLIz0UCAXxBtKpIjGyJ; v=0; unb=3342625761; uc3=vt3=F8dByucj%2BPu98CEsPjg%3D&lg2=VFC%2FuZ9ayeYq2g%3D%3D&id2=UNN5E%2BCNxCTmeQ%3D%3D&nk2=symzrp14SFvM8w%3D%3D; csg=e158c6a6; lgc=%5Cu5C0F%5Cu9F99cxcgao; cookie17=UNN5E%2BCNxCTmeQ%3D%3D; dnk=%5Cu5C0F%5Cu9F99cxcgao; skt=5acee91bdd655b99; existShop=MTU3MTk4NzgwNA%3D%3D; uc4=id4=0%40UgQxlJz5L5Tdls4BK%2BsdqbH4MLk8&nk4=0%40sVYTNakktI2J71HcZK%2B0xxDC%2BDng; tracknick=%5Cu5C0F%5Cu9F99cxcgao; _cc_=W5iHLLyFfA%3D%3D; tg=0; _l_g_=Ug%3D%3D; sg=o1f; _nk_=%5Cu5C0F%5Cu9F99cxcgao; cookie1=B0OsmmuZ3XRBlfT4RmnPR0sbhg2pIYLBZ6M6p3hMTnE%3D; uc1=cookie16=U%2BGCWk%2F74Mx5tgzv3dWpnhjPaQ%3D%3D&cookie21=UtASsssmeW6lpyd%2BB%2B3t&cookie15=Vq8l%2BKCLz3%2F65A%3D%3D&existShop=false&pas=0&cookie14=UoTbnKo8UDBHXA%3D%3D&tag=8&lng=zh_CN; mt=ci=0_1; l=dBgSXBauqGXlvNsDBOCwVuI8YdQOAIRAguPRwhApi_5B9_T6fvbOkZkD2E96cjWftzTB4DlUQKv9-etXiDt6Qtk8sxAJvxDc.; isg=BNLSgblL29-ZPicjbaEizx2QI550S9VMW8fzYZwrzgVwr3KphHIYjcTNG0s2304V",
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36",
        'sec-fetch-mode': "navigate",
        'sec-fetch-user': "?1",
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        'sec-fetch-site': "same-site",
        'referer': "https://lp930428.taobao.com/search.htm?spm=a1z10.1-c-s.0.0.21c2669dXxI29C&search=y",
        'accept-language': "zh-CN,zh;q=0.9",
        'cache-control': "no-cache",
        'Postman-Token': "7c7dc56c-6a76-425d-8f60-98609da11052",
    }

    # Silence urllib3 warnings; the request below skips TLS certificate
    # verification (verify=False).
    requests.packages.urllib3.disable_warnings()
    html = requests.get(page_url, headers=request_headers, verify=False).text

    titles = re.findall('class="tb-main-title".*?data-title="(.*?)"', html)
    signs = re.findall("sign=(.*?)&", html)
    return titles, signs
# Fetch the cumulative review count of an item
def get_comments(i):
    """Query the rate-count endpoint for an item and extract the count digits.

    Args:
        i: Item ID, formatted into the ``itemId`` query parameter and into
            the ``referer`` header.

    Returns:
        The list produced by ``re.findall``: every run of digits that
        follows an occurrence of ``count`` in the response body (empty when
        nothing matches).
    """
    url = "https://rate.taobao.com/detailCount.do?_ksTS=1571993971073_117&callback=jsonp118&itemId={}".format(i)
    # NOTE(review): the Cookie below is a hard-coded captured session;
    # presumably it has to be refreshed once it expires -- confirm.
    headers = {
        'Host': "rate.taobao.com",
        'Cookie': "t=e3ced4867bbbb33ad8e43a6664f1b440; thw=cn; cookie2=1ae89e1b7eb4f38050c2a8484ca7f9e3; _tb_token_=fe3fdd7e7787e; enc=FCtEbwsd4d4qFaGEEU1%2Bh6zQqkT31zH7ZjTng%2ByjgFV1IKn98igFAnRQ80yP1MGcCWXDBuTJBB%2Fv2BN0HrnsFw%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; cna=cfrcFZLIz0UCAXxBtKpIjGyJ; v=0; unb=3342625761; uc3=vt3=F8dByucj%2BPu98CEsPjg%3D&lg2=VFC%2FuZ9ayeYq2g%3D%3D&id2=UNN5E%2BCNxCTmeQ%3D%3D&nk2=symzrp14SFvM8w%3D%3D; csg=e158c6a6; lgc=%5Cu5C0F%5Cu9F99cxcgao; cookie17=UNN5E%2BCNxCTmeQ%3D%3D; dnk=%5Cu5C0F%5Cu9F99cxcgao; skt=5acee91bdd655b99; existShop=MTU3MTk4NzgwNA%3D%3D; uc4=id4=0%40UgQxlJz5L5Tdls4BK%2BsdqbH4MLk8&nk4=0%40sVYTNakktI2J71HcZK%2B0xxDC%2BDng; tracknick=%5Cu5C0F%5Cu9F99cxcgao; _cc_=W5iHLLyFfA%3D%3D; tg=0; _l_g_=Ug%3D%3D; sg=o1f; _nk_=%5Cu5C0F%5Cu9F99cxcgao; cookie1=B0OsmmuZ3XRBlfT4RmnPR0sbhg2pIYLBZ6M6p3hMTnE%3D; uc1=cookie16=U%2BGCWk%2F74Mx5tgzv3dWpnhjPaQ%3D%3D&cookie21=UtASsssmeW6lpyd%2BB%2B3t&cookie15=Vq8l%2BKCLz3%2F65A%3D%3D&existShop=false&pas=0&cookie14=UoTbnKo8UDBHXA%3D%3D&tag=8&lng=zh_CN; mt=ci=0_1; l=dBgSXBauqGXlvNsDBOCwVuI8YdQOAIRAguPRwhApi_5B9_T6fvbOkZkD2E96cjWftzTB4DlUQKv9-etXiDt6Qtk8sxAJvxDc.; isg=BNLSgblL29-ZPicjbaEizx2QI550S9VMW8fzYZwrzgVwr3KphHIYjcTNG0s2304V",
        'sec-fetch-mode': "no-cors",
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36",
        'accept': "*/*",
        'sec-fetch-site': "same-site",
        'referer': "https://item.taobao.com/item.htm?spm=a1z10.3-c-s.w4002-21751133153.48.10971c25BMetQJ&id={}".format(i),
        'accept-language': "zh-CN,zh;q=0.9",
        'cache-control': "no-cache",
    }
    # Silence urllib3 warnings; the request below skips TLS certificate
    # verification (verify=False).
    requests.packages.urllib3.disable_warnings()
    response = requests.request("GET", url, headers=headers, verify=False)
    res = response.text
    # Raw string: "\d" in a plain string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    count = re.findall(r'count.*?(\d+)', res)
    return count
# Fetch the Taobao price and the number of completed transactions
def get_pice(i):
    """Query the detail-data endpoint for an item and read sales count and price.

    Args:
        i: Item ID, sent as the ``itemId`` query parameter and formatted
            into the ``Referer`` header.

    Returns:
        A ``(trading, price)`` tuple taken from the decoded JSON response:
        ``data.soldQuantity.confirmGoodsCount`` (completed transactions) and
        ``data.promotion.promoData.def[0].price``.

    Raises:
        ValueError: If the response body is not valid JSON
            (``json.loads`` failure).
        KeyError, IndexError: If the decoded JSON lacks one of the
            expected keys or the ``def`` list is empty.
    """
    endpoint = "https://detailskip.taobao.com/service/getData/1/p1/item/detail/sib.htm"
    request_headers = {
        'Sec-Fetch-Mode': "no-cors",
        'Cookie': "t=e3ced4867bbbb33ad8e43a6664f1b440; thw=cn; cookie2=1ae89e1b7eb4f38050c2a8484ca7f9e3; _tb_token_=fe3fdd7e7787e; enc=FCtEbwsd4d4qFaGEEU1%2Bh6zQqkT31zH7ZjTng%2ByjgFV1IKn98igFAnRQ80yP1MGcCWXDBuTJBB%2Fv2BN0HrnsFw%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; cna=cfrcFZLIz0UCAXxBtKpIjGyJ; v=0; unb=3342625761; uc3=vt3=F8dByucj%2BPu98CEsPjg%3D&lg2=VFC%2FuZ9ayeYq2g%3D%3D&id2=UNN5E%2BCNxCTmeQ%3D%3D&nk2=symzrp14SFvM8w%3D%3D; csg=e158c6a6; lgc=%5Cu5C0F%5Cu9F99cxcgao; cookie17=UNN5E%2BCNxCTmeQ%3D%3D; dnk=%5Cu5C0F%5Cu9F99cxcgao; skt=5acee91bdd655b99; existShop=MTU3MTk4NzgwNA%3D%3D; uc4=id4=0%40UgQxlJz5L5Tdls4BK%2BsdqbH4MLk8&nk4=0%40sVYTNakktI2J71HcZK%2B0xxDC%2BDng; tracknick=%5Cu5C0F%5Cu9F99cxcgao; _cc_=W5iHLLyFfA%3D%3D; tg=0; _l_g_=Ug%3D%3D; sg=o1f; _nk_=%5Cu5C0F%5Cu9F99cxcgao; cookie1=B0OsmmuZ3XRBlfT4RmnPR0sbhg2pIYLBZ6M6p3hMTnE%3D; uc1=cookie16=U%2BGCWk%2F74Mx5tgzv3dWpnhjPaQ%3D%3D&cookie21=UtASsssmeW6lpyd%2BB%2B3t&cookie15=Vq8l%2BKCLz3%2F65A%3D%3D&existShop=false&pas=0&cookie14=UoTbnKo8UDBHXA%3D%3D&tag=8&lng=zh_CN; mt=ci=0_1; l=dBgSXBauqGXlvNsDBOCwVuI8YdQOAIRAguPRwhApi_5B9_T6fvbOkZkD2E96cjWftzTB4DlUQKv9-etXiDt6Qtk8sxAJvxDc.; isg=BNLSgblL29-ZPicjbaEizx2QI550S9VMW8fzYZwrzgVwr3KphHIYjcTNG0s2304V",
        'accept': "text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01",
        'Referer': "https://item.taobao.com/item.htm?spm=a1z10.3-c-s.w4002-21751133153.33.7ffe1c25zHNgcL&id={}".format(i),
        'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36",
        'cache-control': "no-cache",
    }
    # NOTE(review): sellerId is hard-coded; presumably it identifies the one
    # shop this demo targets -- confirm before reusing for another shop.
    query = {
        "itemId": i,
        "sellerId": "1091756444",
        "modules": "dynStock,qrcode,viewer,price,duty,xmpPromotion,delivery,activity,fqg,zjys,couponActivity,soldQuantity,page,originalPrice,tradeContract",
    }

    # Silence urllib3 warnings; the request below skips TLS certificate
    # verification (verify=False).
    requests.packages.urllib3.disable_warnings()
    reply = requests.get(endpoint, headers=request_headers, params=query, verify=False)

    detail = json.loads(reply.text)['data']
    # Read the completed-transaction count first, then the price.
    sold_count = detail['soldQuantity']['confirmGoodsCount']
    price = detail['promotion']['promoData']['def'][0]['price']
    return sold_count, price
# Fetch the item's favourite ("collection") count
def get_collection(i,sing):
    """Query the counter endpoint for an item and return its ``ICCP_1_<id>`` value.

    Args:
        i: Item ID, formatted into the counter keys of the request URL, the
            ``referer`` header and the key looked up in the response.
        sing: Value placed in the ``sign`` query parameter (the caller is
            expected to obtain it from the item's detail page).

    Returns:
        The value stored under ``"ICCP_1_<i>"`` in the literal found between
        the first pair of parentheses of the response body.

    Raises:
        IndexError: If the response contains no parenthesised payload.
        ValueError, SyntaxError: If the payload is not a plain Python literal.
        KeyError: If the payload has no ``ICCP_1_<i>`` entry.
    """
    # Function-scope import so the block stays self-contained.
    import ast

    url = "https://count.taobao.com/counter3?_ksTS=1571993971066_103&callback=jsonp104&inc=ICVT_7_{}&sign={}&keys=DFX_200_1_{},ICVT_7_{},ICCP_1_{},SCCP_2_101463431".format(i, sing, i, i, i)
    # NOTE(review): the Cookie below is a hard-coded captured session;
    # presumably it has to be refreshed once it expires -- confirm.
    headers = {
        'Host': "count.taobao.com",
        'Cookie': "t=e3ced4867bbbb33ad8e43a6664f1b440; thw=cn; cookie2=1ae89e1b7eb4f38050c2a8484ca7f9e3; _tb_token_=fe3fdd7e7787e; enc=FCtEbwsd4d4qFaGEEU1%2Bh6zQqkT31zH7ZjTng%2ByjgFV1IKn98igFAnRQ80yP1MGcCWXDBuTJBB%2Fv2BN0HrnsFw%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; cna=cfrcFZLIz0UCAXxBtKpIjGyJ; v=0; unb=3342625761; uc3=vt3=F8dByucj%2BPu98CEsPjg%3D&lg2=VFC%2FuZ9ayeYq2g%3D%3D&id2=UNN5E%2BCNxCTmeQ%3D%3D&nk2=symzrp14SFvM8w%3D%3D; csg=e158c6a6; lgc=%5Cu5C0F%5Cu9F99cxcgao; cookie17=UNN5E%2BCNxCTmeQ%3D%3D; dnk=%5Cu5C0F%5Cu9F99cxcgao; skt=5acee91bdd655b99; existShop=MTU3MTk4NzgwNA%3D%3D; uc4=id4=0%40UgQxlJz5L5Tdls4BK%2BsdqbH4MLk8&nk4=0%40sVYTNakktI2J71HcZK%2B0xxDC%2BDng; tracknick=%5Cu5C0F%5Cu9F99cxcgao; _cc_=W5iHLLyFfA%3D%3D; tg=0; _l_g_=Ug%3D%3D; sg=o1f; _nk_=%5Cu5C0F%5Cu9F99cxcgao; cookie1=B0OsmmuZ3XRBlfT4RmnPR0sbhg2pIYLBZ6M6p3hMTnE%3D; uc1=cookie16=U%2BGCWk%2F74Mx5tgzv3dWpnhjPaQ%3D%3D&cookie21=UtASsssmeW6lpyd%2BB%2B3t&cookie15=Vq8l%2BKCLz3%2F65A%3D%3D&existShop=false&pas=0&cookie14=UoTbnKo8UDBHXA%3D%3D&tag=8&lng=zh_CN; mt=ci=0_1; _m_h5_tk=f1425a910c31b9877a2b467419fa4bfc_1572002857779; _m_h5_tk_enc=25325b15bf3ab20673732ed53229f569; l=dBx1kOlIq1-Yna4UBOCidweULX7OSIRAguWK8C09i_5CW6Ls_U7OkZlM8Fp6VjWfGeTB4DlUQKv9-etXmxeT62k8sxAJwxDc.; isg=BJiYNvaHccm3pl2NmLBHZTCVacCDfP38DcXpT9KJ5FOGbThXepHMm65LoeV4_bTj",
        'sec-fetch-mode': "no-cors",
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36",
        'accept': "*/*",
        'sec-fetch-site': "same-site",
        'referer': "https://item.taobao.com/item.htm?spm=a1z10.3-c-s.w4002-21751133153.39.56a81c25HPoZfb&id={}".format(i),
        'accept-language': "zh-CN,zh;q=0.9",
        'cache-control': "no-cache",
        'Postman-Token': "1420a7da-b2b9-4249-829b-edddeb105432",
    }
    # Silence urllib3 warnings; the request below skips TLS certificate
    # verification (verify=False).
    requests.packages.urllib3.disable_warnings()
    response = requests.request("GET", url, headers=headers, verify=False)
    res = response.text
    # The URL asks for callback=jsonp104, so the data sits inside the first
    # pair of parentheses. Raw string: "\(" in a plain literal is an invalid
    # escape sequence.
    payload = re.findall(r'\((.*?)\)', res)[0]
    # SECURITY: this previously used eval() on text received from the network,
    # which would execute arbitrary code returned by the server.
    # ast.literal_eval accepts the same plain dict/number/string literals but
    # refuses anything executable.
    counters = ast.literal_eval(payload)
    return counters["ICCP_1_{}".format(i)]
潦草是潦草了些,希望对大家有所帮助