自己平日喜欢读书,因此做了一个图书的目录,记录了自己看过的图书的名录如图:
这是一个xlsx文件
下面的代码,查询上面的每一本书,并且下载书籍的封面。需要说明的有:
1. 查询书籍的平台是豆瓣读书
2. 书籍的中文名不能直接嵌入到请求链接中:URL中的非ASCII字符(如汉字)需要先进行百分号编码(URL编码),因此这里使用了urllib.parse的quote
效果如图:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
#--author:XiangguoSun
#--2016.12.10
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.request import urlretrieve
from urllib.parse import quote
from bs4 import BeautifulSoup
import re
def getTitle(url):
    """Download a search-result page and return its cover-image tags.

    Args:
        url: Address of the page to download.

    Returns:
        The ``<img>`` tags whose ``src`` starts with
        ``https://img3.doubanio.com/lpic/`` (an empty list when none match),
        or ``None`` when the page cannot be fetched or parsed.
    """
    # Local import keeps this function self-contained; the file header only
    # imports HTTPError.
    from urllib.error import URLError

    try:
        # The context manager closes the connection once the body is read.
        with urlopen(url) as response:
            page = response.read()
    except URLError:
        # URLError is the base class of HTTPError, so this also covers DNS
        # failures, refused connections and unsupported URL schemes.
        return None
    try:
        bsObj = BeautifulSoup(page, "lxml")
        title = bsObj.find_all(
            "img",
            {"src": re.compile(r"https://img3\.doubanio\.com/lpic/.*")},
        )
    except AttributeError:
        return None
    return title
def get_book_picture(bookname, save_dir='E:/books/'):
    """Search Douban for *bookname* and save the first cover image found.

    Args:
        bookname: Title of the book. It is used both as the search query and
            as the base name of the saved ``.jpg`` file.
        save_dir: Directory the image is written to; created if it does not
            exist. Defaults to ``E:/books/``.

    Raises:
        IndexError: If the search page could not be fetched or contains no
            matching cover image.

    Errors raised while downloading or writing the image propagate to the
    caller.
    """
    import os

    seed_url = u"https://m.douban.com/search/?query="
    # Non-ASCII titles must be percent-encoded before going into the URL.
    url = seed_url + quote(bookname) + u"&type=book"
    print(url)

    titlelist = getTitle(url)
    # getTitle returns None on failure and an empty list when nothing
    # matched; report both cases with one clear error.
    if not titlelist:
        raise IndexError("no cover image found for %s" % (bookname,))

    img_url = titlelist[0]["src"]
    os.makedirs(save_dir, exist_ok=True)
    urlretrieve(img_url, os.path.join(save_dir, str(bookname) + '.jpg'))
    print(bookname, " save done!")
import pandas as pd
import time

# Load the reading list; the titles come from its "bookname" column.
data = pd.read_excel("./books.xlsx")

for bookname in data["bookname"]:
    print("start to search book: ", bookname)
    try:
        get_book_picture(bookname)
    except Exception:
        # Best effort: record the title that failed and keep going.
        # Catching Exception rather than using a bare except lets Ctrl+C
        # (KeyboardInterrupt) and SystemExit stop the run.
        # UTF-8 avoids encode errors for titles outside the platform's
        # default code page; str() guards against non-string cells.
        with open("./photos.txt", 'a', encoding="utf-8") as f:
            f.write(str(bookname) + '\n')
    # Pause between requests after both success and failure.
    time.sleep(5)