爬取豆瓣书籍信息,保存为RIS文件并导入EndNote

读书期间下载了许多的电子书,但是分类很乱。突发奇想,可以用endnote进行管理,但是手动输入书籍信息很麻烦,于是就想爬取豆瓣读书中的书籍信息。

首先用python爬取豆瓣中书籍的基本信息,如书名,作者,摘要等。然后将其保存为RIS文件,最后用endnote读取RIS文件。

实现GUI界面,输入豆瓣中书籍地址,即可爬取书籍信息,并且保存为RIS文件,可以导入各种文献管理软件之中。最后将其打包成exe文件,具体的下载地址如下:https://download.csdn.net/download/stromlord/12552065

爬取书籍信息,并保存为RIS文件(下面的代码需要先导入 requests、time、re、codecs 和 bs4 中的 BeautifulSoup,并自行定义请求头字典 DouBan_header):

def html_request(url, encodeing='utf-8', timeout=5, headers=None):
    """Fetch *url* once and return the decoded response body.

    Args:
        url: Address to request with HTTP GET.
        encodeing: Codec used to decode the raw response bytes; bytes that
            cannot be decoded are dropped.
        timeout: Timeout passed through to ``requests.get``.
        headers: Optional request headers passed through to ``requests.get``.

    Returns:
        The decoded page text when the status code is 200, otherwise ``None``.
    """
    reply = requests.get(url, headers=headers, timeout=timeout)
    # Anything other than a plain 200 is reported to the caller as "no page".
    if reply.status_code != 200:
        return None
    return reply.content.decode(encodeing, 'ignore')


def html_read(url, encodeing='utf-8', timeout=5, headers=None):
    """Fetch *url* via ``html_request``, retrying on timeouts and connection errors.

    Read timeouts and connection errors are counted separately. After each
    failure the function sleeps three seconds, prints the error name without
    a newline, and gives up once either counter exceeds ten.

    Args:
        url: Address to download.
        encodeing: Codec forwarded to ``html_request``.
        timeout: Timeout forwarded to ``html_request``.
        headers: Optional request headers forwarded to ``html_request``.

    Returns:
        Whatever ``html_request`` returned on the first attempt that did not
        raise, or ``''`` when the retries were exhausted.
    """
    failures = {"ReadTimeout": 0, "ConnectionError": 0}
    page = ''
    exhausted = False
    while not exhausted:
        try:
            page = html_request(url, encodeing, timeout, headers=headers)
        except requests.exceptions.ReadTimeout:
            reason = "ReadTimeout"
        except requests.exceptions.ConnectionError:
            reason = "ConnectionError"
        else:
            # The request completed without raising: stop retrying.
            break
        # Shared failure path: back off, report, then count this kind of error.
        time.sleep(3)
        print(reason, end='')
        failures[reason] += 1
        exhausted = failures[reason] > 10
    return page


# Characters removed from a title before it is used as a file name
# (path separators plus the characters Windows rejects in file names).
_FILENAME_STRIP = str.maketrans('', '', '\\/:*?"<>|')


def _squash(text):
    """Collapse every run of whitespace in *text* into a single space."""
    return ' '.join(text.split())


def _parse_info_fields(info_node):
    """Return ``(label, value, link_texts)`` triples read from the ``#info`` element.

    The element is serialised and cut at each ``<br/>`` (the form bs4 emits
    for a line break); every piece is then re-parsed so that labels and
    values are taken from text content rather than from raw markup.
    NOTE(review): assumes each field of the info box ends with a line break
    and is written as "label: value" -- confirm against a live page.
    """
    fields = []
    for fragment in str(info_node).split('<br/>'):
        piece = BeautifulSoup(fragment, "html.parser")
        # Split on the first ASCII or full-width colon only, so a value that
        # itself contains a colon (e.g. a subtitle) is kept whole.
        parts = re.split(r'[::]', piece.get_text(), maxsplit=1)
        if len(parts) < 2:
            continue
        links = [_squash(a.get_text()) for a in piece.find_all('a')]
        fields.append((_squash(parts[0]), _squash(parts[1]),
                       [text for text in links if text]))
    return fields


def book_info(book_url, ris_dir='', series=None, ris_flag=True):
    """Scrape one Douban book page and optionally append it to a RIS file.

    Fields are read through BeautifulSoup's text API instead of regular
    expressions over serialised HTML, so parsing does not depend on the exact
    spelling of the page's tags.

    Args:
        book_url: Address of the book page.
        ris_dir: Prefix prepended verbatim to the output file name (include a
            trailing separator when it is a directory).
        series: When given, the record is appended to ``ris_dir + series +
            '.ris'`` instead of a file named after the book title.
        ris_flag: When false, nothing is written; the page is still parsed.

    Returns:
        The book title with the `` (豆瓣)`` suffix removed.

    Raises:
        ValueError: If the page could not be downloaded or has no ``<title>``.
    """
    book_info_dict = {
        '书名': None,
        '作者': None,
        '摘要': None,
        '出版社': None,
        '副标题': None,
        '原作名': None,
        '译者': None,
        '出版年': None,
        '页数': None,
        '丛书': None,
        'ISBN': None,
        '标签': None
    }

    book_html = html_read(book_url, headers=DouBan_header)
    time.sleep(1.5)
    # html_read yields None for a non-200 answer and '' once retries run out.
    if not book_html:
        raise ValueError("failed to download book page: " + str(book_url))
    book_soup = BeautifulSoup(book_html, "html.parser")

    title_nodes = book_soup.select('title')
    if not title_nodes:
        raise ValueError("no <title> found in page: " + str(book_url))
    book_info_dict['书名'] = title_nodes[0].get_text().replace(' (豆瓣)', '').strip()

    info_nodes = book_soup.select('#info')
    if info_nodes:
        for label, value, links in _parse_info_fields(info_nodes[0]):
            if label in ('作者', '译者'):
                # People are normally links; fall back to "a / b" plain text.
                names = links or [n.strip() for n in value.split('/') if n.strip()]
                book_info_dict[label] = names
            elif label in book_info_dict:
                book_info_dict[label] = value

    keywords = [_squash(tag.get_text())
                for tag in book_soup.select('.indent span .tag')]
    book_info_dict['标签'] = ', '.join(word for word in keywords if word)

    intro_nodes = book_soup.select('.intro')
    if intro_nodes:
        chosen = intro_nodes[0]
        # Presumably a first block mentioning javascript is the collapsed
        # teaser and the next one holds the full text; only switch when a
        # second block actually exists.
        if 'javascript' in str(chosen) and len(intro_nodes) > 1:
            chosen = intro_nodes[1]
        paragraphs = [p.get_text().strip() for p in chosen.find_all('p')]
        abstract = '\n'.join(text for text in paragraphs if text)
        book_info_dict['摘要'] = abstract or chosen.get_text().strip()

    for key in book_info_dict:
        if book_info_dict[key] is None:
            book_info_dict[key] = ''

    if ris_flag:
        if series is None:
            file_stem = book_info_dict['书名'].translate(_FILENAME_STRIP)
        else:
            file_stem = series
        save_dir = (ris_dir + file_stem + '.ris').replace('\n', '')

        # RIS tag lines have the form "XX  - value": two spaces, hyphen, space.
        record = ["TY  - BOOK"]
        record.extend("A4  - " + str(name) for name in (book_info_dict['译者'] or []))
        record.append("AB  - " + book_info_dict['摘要'])
        record.extend("AU  - " + str(name) for name in (book_info_dict['作者'] or []))
        record.append("DA  - " + book_info_dict['出版年'])
        record.append("PY  - " + book_info_dict['出版年'][0:4])
        record.append("KW  - " + book_info_dict['标签'])
        record.append("PB  - " + book_info_dict['出版社'])
        record.append("SN  - " + book_info_dict['ISBN'])
        record.append("T2  - " + book_info_dict['丛书'])
        record.append("TI  - " + book_info_dict['书名'])
        record.append("ER  - ")
        record.append("")  # blank line separating records in a shared file

        # newline='' writes "\n" untranslated; append mode lets several books
        # accumulate in one series file.
        with open(save_dir, mode='a', encoding='utf-8', newline='') as f:
            f.write("\n".join(record) + "\n")

    return book_info_dict['书名']

用tkinter制作GUI,实现功能(需要 from tkinter import *、import os 和 import pyperclip;创建 tkURL() 对象后还要调用 mainloop() 才会显示窗口):

class tkURL(object):
    """Small tkinter window that turns a book page URL into a RIS file.

    The window holds a URL entry, a status label and three buttons:
    ``Clip`` pastes the clipboard into the entry, ``URL`` runs ``book_info``
    on the entered address, and ``Quit`` leaves the Tk main loop.
    """

    def __init__(self):
        """Build the widgets; the caller is expected to start the main loop."""
        self.top = Tk()
        self.entry = Entry(self.top, width=50)
        self.entry.pack()

        # Text shown in the status label: the result path or an error message.
        self.cwd = StringVar(self.top)
        self.cwd.set('None')

        self.label = Label(self.top, textvariable=self.cwd)
        self.label.pack()

        self.frame = Frame(self.top)
        self.bClip = Button(self.frame, text='Clip', command=self.entryClip,
                            activeforeground='white', activebackground='red')
        self.bUrl = Button(self.frame, text='URL', command=self.getEntry,
                           activeforeground='white', activebackground='blue')
        self.bQuit = Button(self.frame, text='Quit', command=self.top.quit,
                            activeforeground='white', activebackground='red')
        self.bClip.pack(side=LEFT)
        self.bUrl.pack(side=LEFT)
        self.bQuit.pack(side=RIGHT)
        self.frame.pack()

    def getEntry(self, ev=None):
        """Scrape the URL in the entry and show the outcome in the label.

        Args:
            ev: Unused event object, accepted so the method can also serve as
                an event binding.
        """
        url = self.entry.get().strip()
        if not url:
            self.cwd.set('Please enter a URL')
            return
        try:
            title = book_info(url, ris_dir='')
        except Exception as err:
            # GUI boundary: an exception raised inside a Tk callback is only
            # printed to stderr, so report it in the window instead.
            self.cwd.set('Error: ' + str(err))
            return
        # Join with a separator; plain concatenation glued the title onto the
        # last directory name.
        self.cwd.set(os.path.join(os.getcwd(), title))

    def entryClip(self, ev=None):
        """Replace the entry's content with the current clipboard text.

        Args:
            ev: Unused event object, accepted so the method can also serve as
                an event binding.
        """
        self.entry.delete(0, END)
        content = pyperclip.paste()
        self.entry.insert(0, content)

最后用pyinstaller制作exe。完成!

你可能感兴趣的:(python)