读书期间下载了许多电子书,但是分类很乱。突发奇想,可以用EndNote进行管理,但是手动输入书籍信息很麻烦,于是就想爬取豆瓣读书中的书籍信息。
首先用Python爬取豆瓣中书籍的基本信息,如书名、作者、摘要等。然后将其保存为RIS文件,最后用EndNote读取RIS文件。
实现GUI界面,输入豆瓣中书籍地址,即可爬取书籍信息,并且保存为RIS文件,可以导入各种文献管理软件之中。最后将其打包成exe文件,具体的下载地址如下:https://download.csdn.net/download/stromlord/12552065
爬取书籍信息,并保存为RIS文件:
def html_request(url, encodeing='utf-8', timeout=5, headers=None):
    """Fetch *url* once and return its body as text.

    Args:
        url: Address to request with ``requests.get``.
        encodeing: Codec used to decode the raw response bytes.
        timeout: Timeout in seconds handed to ``requests.get``.
        headers: Optional request headers.

    Returns:
        The decoded body when the status code is 200 (bytes that cannot be
        decoded are dropped), otherwise ``None``.
    """
    reply = requests.get(url, headers=headers, timeout=timeout)
    if reply.status_code != 200:
        return None
    return reply.content.decode(encodeing, 'ignore')
def html_read(url, encodeing='utf-8', timeout=5, headers=None):
    """Fetch *url* via ``html_request``, retrying on timeouts and connection errors.

    Read timeouts and connection errors are counted separately. After each
    failure the function sleeps 3 seconds, prints the failure kind, and gives
    up once one kind has occurred more than 10 times.

    Args:
        url: Address to fetch.
        encodeing: Codec forwarded to ``html_request``.
        timeout: Per-request timeout in seconds.
        headers: Optional request headers.

    Returns:
        Whatever ``html_request`` returned on the first attempt that did not
        raise, or ``''`` if the retries were exhausted.
    """
    failures = {'ReadTimeout': 0, 'ConnectionError': 0}
    page = ''
    while True:
        try:
            page = html_request(url, encodeing, timeout, headers=headers)
        except requests.exceptions.ReadTimeout:
            kind = 'ReadTimeout'
        except requests.exceptions.ConnectionError:
            kind = 'ConnectionError'
        else:
            # The request completed without raising: stop retrying.
            break
        time.sleep(3)
        print(kind, end='')
        failures[kind] += 1
        if failures[kind] > 10:
            break
    return page
def _collapse_ws(text):
    """Return *text* with every run of whitespace collapsed to one space."""
    return ' '.join(text.split())


def _split_info_rows(info_node):
    """Group the direct children of the info box into rows separated by <br>."""
    rows = [[]]
    for child in info_node.children:
        # Text nodes are str subclasses; only real tags can be a <br>.
        if not isinstance(child, str) and child.name == 'br':
            rows.append([])
        else:
            rows[-1].append(child)
    return rows


def _row_text_and_links(row):
    """Return (plain text, list of <a> texts) for one row of the info box."""
    texts = []
    links = []
    for node in row:
        if isinstance(node, str):
            texts.append(str(node))
            continue
        texts.append(node.get_text())
        anchors = [node] if node.name == 'a' else node.find_all('a')
        links.extend(_collapse_ws(anchor.get_text()) for anchor in anchors)
    return ''.join(texts), [link for link in links if link]


def book_info(book_url, ris_dir='', series=None, ris_flag=True):
    """Scrape one Douban book page and optionally append it to a RIS file.

    The page is read through the DOM instead of running regexes over the
    serialized HTML. Each <br>-separated row of the ``#info`` box is treated
    as a ``label: value`` pair.
    NOTE(review): the exact Douban markup is not visible in this file; the
    row layout is assumed from the selectors used here - confirm on a live page.

    Args:
        book_url: URL of the book page.
        ris_dir: Prefix prepended verbatim to the output file name (include
            the trailing path separator if it is a directory).
        series: If given, the record is appended to ``<series>.ris`` instead
            of a file named after the book title.
        ris_flag: When false, nothing is written to disk.

    Returns:
        The book title (the page title without the trailing "(豆瓣)").

    Raises:
        ValueError: If no HTML could be fetched or the page has no <title>.
    """
    list_fields = ('作者', '译者')
    book_info_dict = {
        '书名': None,
        '作者': None,
        '摘要': None,
        '出版社': None,
        '副标题': None,
        '原作名': None,
        '译者': None,
        '出版年': None,
        '页数': None,
        '丛书': None,
        'ISBN': None,
        '标签': None,
    }

    book_html = html_read(book_url, headers=DouBan_header)
    # Pause after every request (presumably to avoid hammering the site).
    time.sleep(1.5)
    if not book_html:
        # html_read yields '' after exhausted retries and None on a non-200 reply.
        raise ValueError('no HTML received from ' + str(book_url))
    book_soup = BeautifulSoup(book_html, "html.parser")

    # --- title ---------------------------------------------------------
    title_nodes = book_soup.select('title')
    if not title_nodes:
        raise ValueError('page has no <title>: ' + str(book_url))
    title = _collapse_ws(title_nodes[0].get_text())
    site_suffix = '(豆瓣)'
    if title.endswith(site_suffix):
        title = title[:-len(site_suffix)].rstrip()
    book_info_dict['书名'] = title

    # --- basic info box --------------------------------------------------
    info_nodes = book_soup.select('#info')
    if info_nodes:
        for row in _split_info_rows(info_nodes[0]):
            text, links = _row_text_and_links(row)
            # Split on the first ASCII or full-width colon only, so colons
            # inside the value (e.g. a subtitle) are preserved.
            parts = re.split(r'[::]', text, maxsplit=1)
            if len(parts) != 2:
                continue
            key = _collapse_ws(parts[0])
            value = _collapse_ws(parts[1])
            if key not in book_info_dict:
                continue
            if key in list_fields:
                # Prefer the linked names; fall back to a "/"-separated list.
                names = links or [_collapse_ws(p) for p in value.split('/')]
                book_info_dict[key] = [name for name in names if name]
            else:
                book_info_dict[key] = value

    # --- tags ------------------------------------------------------------
    tags = [_collapse_ws(node.get_text())
            for node in book_soup.select('.indent span .tag')]
    book_info_dict['标签'] = ', '.join(tag for tag in tags if tag)

    # --- abstract --------------------------------------------------------
    intro_nodes = book_soup.select('.intro')
    if intro_nodes:
        chosen = intro_nodes[0]
        # A first block containing a javascript link is skipped in favour of
        # the next one, but only when a next one actually exists.
        if 'javascript' in str(chosen) and len(intro_nodes) > 1:
            chosen = intro_nodes[1]
        paragraphs = [_collapse_ws(p.get_text()) for p in chosen.select('p')]
        if not paragraphs:
            paragraphs = [_collapse_ws(chosen.get_text())]
        book_info_dict['摘要'] = '\n'.join(p for p in paragraphs if p)

    # Replace missing values so the writer below can concatenate safely.
    for key in book_info_dict:
        if book_info_dict[key] is None:
            book_info_dict[key] = [] if key in list_fields else ''

    if ris_flag:
        if series is None:
            # Drop characters that are not allowed in Windows file names.
            file_stem = re.sub(r'[\\/:*?"<>|]', '', book_info_dict['书名'])
        else:
            file_stem = series
        save_dir = (ris_dir + file_stem + '.ris').replace('\n', '')

        # RIS lines are "<TAG>  - <value>": two spaces, hyphen, one space.
        records = [("TY", "BOOK")]
        records.extend(("A4", str(name)) for name in book_info_dict['译者'])
        if book_info_dict['摘要']:
            records.append(("AB", book_info_dict['摘要']))
        records.extend(("AU", str(name)) for name in book_info_dict['作者'])
        records.extend([
            ("DA", book_info_dict['出版年']),
            ("PY", book_info_dict['出版年'][0:4]),
            ("KW", book_info_dict['标签']),
            ("PB", book_info_dict['出版社']),
            ("SN", book_info_dict['ISBN']),
            ("T2", book_info_dict['丛书']),
            ("TI", book_info_dict['书名']),
            ("ER", ""),
        ])
        body = ''.join(tag + "  - " + value + "\n" for tag, value in records)
        # Append mode: several books can be collected into one file.
        with codecs.open(save_dir, mode='a', encoding='utf-8') as f:
            f.write(body + "\n")

    return book_info_dict['书名']
用tkinter制作GUI,实现功能:
class tkURL(object):
    """Small tkinter window that scrapes a book URL into a RIS file.

    The window holds a URL entry, a status label, and three buttons:
    "Clip" pastes the clipboard into the entry, "URL" runs ``book_info`` on
    the entered address, and "Quit" calls ``Tk.quit``.
    """

    def __init__(self):
        self.top = Tk()

        # URL input field.
        self.entry = Entry(self.top, width=50)
        self.entry.pack()

        # Status line; shows 'None' until a URL has been processed.
        self.cwd = StringVar(self.top)
        self.cwd.set('None')
        self.label = Label(self.top, textvariable=self.cwd)
        self.label.pack()

        # Button row.
        self.frame = Frame(self.top)
        self.bClip = Button(self.frame, text='Clip', command=self.entryClip,
                            activeforeground='white', activebackground='red')
        self.bUrl = Button(self.frame, text='URL', command=self.getEntry,
                           activeforeground='white', activebackground='blue')
        self.bQuit = Button(self.frame, text='Quit', command=self.top.quit,
                            activeforeground='white', activebackground='red')
        self.bClip.pack(side=LEFT)
        self.bUrl.pack(side=LEFT)
        self.bQuit.pack(side=RIGHT)
        self.frame.pack()

    def getEntry(self, ev=None):
        """Scrape the URL in the entry and show the result in the status label.

        On success the label shows the working directory joined with the book
        title; on failure it shows the error message instead.
        """
        url = self.entry.get().strip()
        try:
            title = book_info(url, ris_dir='')
        except Exception as err:
            # GUI callback boundary: report the problem in the window rather
            # than letting it disappear into Tk's callback error handler.
            self.cwd.set('Error: ' + str(err))
            return
        # os.path.join inserts the separator that plain "+" left out.
        self.cwd.set(os.path.join(os.getcwd(), title))

    def entryClip(self, ev=None):
        """Replace the entry's content with the current clipboard text."""
        self.entry.delete(0, END)
        content = pyperclip.paste()
        self.entry.insert(0, content)
最后用pyinstaller制作exe。完成!