这是一个python爬虫求助帖

各位大佬好,这里是一个新人(对于python或这个软件或爬虫来说都是)。不知道上哪发帖子,大家的博客好像都是传道授业解惑的,我拿来问问题好像不太ok…总之请多包涵!我正在学中国mook嵩天老师的python爬虫课程,目前进行到爬取淘宝商品信息实例编写,然后在这里卡了两天…下面是我按照老师视频和另一篇博客写出来的代码:
`import re
import requests

def getHTMLText(url):
try:
headers = {
‘authority’: ‘s.taobao.com’,
‘cache-control’: ‘max-age=0’,
‘upgrade-insecure-requests’: ‘1’,
‘user-agent’: ‘Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36’,
‘accept’: ‘text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,/;q=0.8,application/signed-exchange;v=b3;q=0.9’,
‘sec-fetch-site’: ‘same-origin’,
‘sec-fetch-mode’: ‘navigate’,
‘sec-fetch-user’: ‘?1’,
‘sec-fetch-dest’: ‘document’,
‘referer’: ‘https://www.taobao.com/’,
‘accept-language’: ‘zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7’,
‘cookie’: ‘thw=cn; t=c9b0f4dd77e9e7b2d5f3f452ed1c38fb; cna=wu8mFy9yqEACAWcbGiBk+mIZ; sgcookie=EKZRg%2FgPQj%2F2P1%2Bir5Gkt; uc3=vt3=F8dBxGR2VD2qYYsdBAU%3D&nk2=F5RCZI4eenEj1r0%3D&lg2=Vq8l%2BKCLz3%2F65A%3D%3D&id2=VyySWWIA43khYA%3D%3D; lgc=tb713019711; uc4=id4=0%40VXtdLOWymc9WlbfiwLiEKHboleYx&nk4=0%40FY4JikNwFlRRaaFjct%2BPoCsgNdDnMg%3D%3D; tracknick=tb713019711; cc=UIHiLt3xSw%3D%3D; enc=aEY9DGTiFZ3i7C6EzxZebLUJUX4vGzi27p2UA0YdxTgCTWkykGBO9M7zZIdGmTeNAyfWlgnVCJTJpCiLCH8dMQ%3D%3D; tfstk=cb-RBdNGKjclYM9P7H3cAGITp9ORZbDRt81avvNf37_gferdiG-MX-KqPOFRDxC…; hng=CN%7Czh-CN%7CCNY%7C156; mt=ci=88_1; v=0; cookie2=12e9edfbb77ff77add29b22cce7540d6; tb_token=f1b4515e97165; nk=tb713019711; JSESSIONID=03E87055CE31D9326C96A47FB438A59D; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; l=eBLWjxEqQ2EGmBpsKOfwlurza77OhIRjDuPzaNbMiT5PO9fHkWVdWZjAlTTMCnhNn65BR35uw33TBzTTeyUBC__Ox6LCgsDdGdTh.; isg=BJaWPmgr1MFN8-BlMdTw92Uo50yYN9px1rjdwgD_7HklwzddaMWggU65Wlcvt9KJ; uc1=cookie14=UoTUPcllIm9qkw%3D%3D’,
}
r = requests.get(url,headers = headers)
r.raise_for_status()
r.encoding = r.apparent_encoding
return r.text
except:
return “”

def parsePage(ilt,html):
try:
plt = re.findall(r’“view_price”:"[\d+.]"’,html)
tlt = re.findall(r’“raw_title”:".
?"’,html)
for i in range(len(plt)):
price = eval(plt[i].split(’:’)[1])
title = eval(tlt[i].split(’:’)[1])
ilt.append([price,title])
except:
print("")

def shangpingxingxi(ilt):
tplt = “{:4}\t{:8}\t{:16}”
print(tplt.format(“序号”,“价格”,“商品名称”))
count = 0
for q in ilt:
count = count + 1
print(tplt.format(count,g[0],g[1]))

def main():
goods = “书包”
depth = 2
start_url = “https://s.taobao.com/search?q=”+goods
infoList = []
for i in range(depth):
try:
url = start_url + ‘$S=’ + str(44*i)
html = getHtmlText(url)
parsePage(infoList,html)
except:
continue

shangpingxingxi(infoList)`

main()

不太懂怎么插入代码块,不知道显示出来的是不是正常的代码。总之,这段代码运行后只能出来一个表头。
我尝试复制了那篇帮助了我的博客的代码,运行成功了,但我把商品‘篮球’改成‘书’,再替换了headers就又不行了…以下是我修改后的那篇博客的代码:
`import requests
import re

def getHtmlText(url):
try:
headers = {
‘authority’: ‘s.taobao.com’,
‘cache-control’: ‘max-age=0’,
‘upgrade-insecure-requests’: ‘1’,
‘user-agent’: ‘Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36’,
‘accept’: ‘text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,/;q=0.8,application/signed-exchange;v=b3;q=0.9’,
‘sec-fetch-site’: ‘same-origin’,
‘sec-fetch-mode’: ‘navigate’,
‘sec-fetch-user’: ‘?1’,
‘sec-fetch-dest’: ‘document’,
‘referer’: ‘https://s.taobao.com/search?q=%E4%B9%A6%E5%8C%85&commend=all&ssid=s5-e&search_type=mall&sourceId=tb.index&area=c2c&spm=a1z02.1.6856637.d4910789’,
‘accept-language’: ‘zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7’,
‘cookie’: ‘thw=cn; t=c9b0f4dd77e9e7b2d5f3f452ed1c38fb; cna=wu8mFy9yqEACAWcbGiBk+mIZ; lgc=tb713019711; tracknick=tb713019711; enc=aEY9DGTiFZ3i7C6EzxZebLUJUX4vGzi27p2UA0YdxTgCTWkykGBO9M7zZIdGmTeNAyfWlgnVCJTJpCiLCH8dMQ%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; mt=ci=88_1; cookie2=1d1828007b808b66a3afd511314a0609; tb_token=f31365d8fd731; v=0; nk=tb713019711; samesite_flag=true; unb=4037110879; cookie17=VyySWWIA43khYA%3D%3D; dnk=tb713019711; l_g=Ug%3D%3D; sg=194; cookie1=AnWZm9oxxM6trJaNcIYDcwAUVF1NAMr9BTd1IrLfomg%3D; sgcookie=EeTqNZZwCBD4SKRPLUrAW; uc3=vt3=F8dBxGR1SJYcU3gqhco%3D&nk2=F5RCZI4eenEj1r0%3D&id2=VyySWWIA43khYA%3D%3D&lg2=VFC%2FuZ9ayeYq2g%3D%3D; csg=9b819fe4; skt=697fbc11a7dcbaed; existShop=MTU4NzYyNDcxOQ%3D%3D; uc4=id4=0%40VXtdLOWymc9WlbfiwLiEK4gqwYpZ&nk4=0%40FY4JikNwFlRRaaFjct%2BPoCgntjGHtg%3D%3D; cc=VFC%2FuZ9ajQ%3D%3D; tfstk=cL71BKar0AD1s1bZ8tNebWsoVf8VZ9HBtl9F1MB7IA0YLH51i7uyF80Gmvgp2B1…; alitrackid=i.taobao.com; lastalitrackid=i.taobao.com; JSESSIONID=8B82153D528CE3F9202D6BDF82E95A32; l=eBLWjxEqQ2EGmTSsBO5aFurza779aIRb81PzaNbMiIHca1mV1F6VENQcc9vMRdtj_tCXvetyIQLleRHv5Ozp_fuIyotrCyConxvO.; isg=BFlZcJcqY2mZ3D_86gUHrk4paEUz5k2YkcWVr3sOBAD_gngUwzQyaLuYhE70PeXQ; uc1=cookie14=UoTUPcqeQbqcaQ%3D%3D&cookie16=WqG3DMC9UpAPBHGz5QBErFxlCA%3D%3D&existShop=false&cookie21=V32FPkk%2FgPzW&cookie15=Vq8l%2BKCLz3%2F65A%3D%3D&pas=0’,
}
r = requests.get(url,headers = header)
r.raise_for_status()
r.encoding = r.apparent_encoding
return r.text
except:
print(“爬取失败”)
return “”

def parsePage(ilist,html):
try:
plt = re.findall(r’“view_price”:"\d+.\d*"’,html)
tlt = re.findall(r’“raw_title”:".*?"’,html)
#print(tlt)
print(len(plt))
for i in range(len(plt)):
price = eval(plt[i].split(’"’)[3])
title = tlt[i].split(’"’)[3]
ilist.append([title,price])
#print(ilist)
except:
print(“解析出错”)

def printGoodsList(ilist,num):
print("=")
tplt = “{0:❤️}\t{1:<30}\t{2:>6}”
print(tplt.format(“序号”,“商品名称”,“价格”))
count = 0
for g in ilist:
count += 1
if count <= num:
print(tplt.format(count,g[0],g[1]))
print("
=")

def main():
goods = “书”
depth = 1
start_url = “https://s.taobao.com/search?q=”+goods
infoList = []
num = 20
for i in range(depth):
try:
url = start_url + ‘$S=’ + str(44*i)
html = getHtmlText(url)
parsePage(infoList,html)
except:
continue

printGoodsList(infoList,num)

main()`

思来想去我开始怀疑我的headers有问题了…因为我真的检查了非常多遍我的语法和拼写,应该是没有问题的。
有哪位大佬可以看看哪里出了问题?万分感谢!

你可能感兴趣的:(提问)