python爬虫读取pdf_python爬取网页转换为PDF文件

"

# Append the string held in `htmls` (assigned just above) to the output file.
# The encoding is given explicitly so the write does not depend on the
# platform's locale default.
with open("android_training_3.html", 'a', encoding="utf-8") as f:
    f.write(htmls)

对上面获取的网址分析,获取正文,并将图片取出存于本地;涉及到的是查找标签和修改属性

# Page processing: extract each article body and download its images.
def get_htmls(urls, title):
    """Fetch every page in *urls* and append its article body to the HTML file.

    For each page, the element with class ``jd-descr`` is located, every
    ``<img>`` inside it is downloaded into the local ``image/`` directory and
    its ``src`` attribute rewritten to that relative path. A heading followed
    by the element's markup is then appended to ``android_training_3.html``.

    Args:
        urls: Page URLs to fetch, in output order.
        title: Page titles, indexed by the same position as *urls*.
    """
    from urllib.parse import urljoin

    # Image files are written below 'image/'; make sure the directory exists.
    os.makedirs('image', exist_ok=True)

    for i, url in enumerate(urls):
        response = requests.get(url, proxies=proxies)
        # NOTE(review): bs4 exports this class as `BeautifulSoup`; confirm the
        # file's import really binds the lowercase name used here.
        soup = beautifulsoup(response.content, "html.parser")
        # NOTE(review): the markup around the heading was lost from the
        # published listing (the literal was left unterminated); an <h1>
        # wrapper is assumed here -- confirm against the original script.
        htmls = "<h1>" + str(i) + "." + title[i] + "</h1>"

        tag = soup.find(class_='jd-descr')
        if tag is None:
            # No article body on this page: report it instead of crashing on
            # `None.find_all` and move on to the next URL.
            print(" (%s) [%s] no 'jd-descr' element, skipped" % (i, title[i]))
            continue

        # Point each image at a local relative path and download the file.
        for img in tag.find_all('img'):
            src = img.get('src')
            if not src:
                continue
            # Resolve relative image paths against the page URL; absolute
            # URLs pass through urljoin unchanged.
            im = requests.get(urljoin(url, src), proxies=proxies)
            filename = os.path.split(src)[1]
            with open('image/' + filename, 'wb') as f:
                f.write(im.content)
            img['src'] = 'image/' + filename

        htmls = htmls + str(tag)
        # Explicit UTF-8 so non-ASCII page text is written the same way on
        # every platform.
        with open("android_training_3.html", 'a', encoding="utf-8") as f:
            f.write(htmls)
        print(" (%s) [%s] download end" % (i, title[i]))

htmls="

你可能感兴趣的:(python爬虫读取pdf)