Scraping the Douban Books Top 250 with Python

Scrape the Douban Books Top 250 with Python and save the results locally.
Two implementations follow: requests + re and requests + xpath.
1. requests + re

import requests
import re

HEADERS = {"User-Agent": "Mozilla/5.0"}  # Douban rejects the default python-requests User-Agent

def getHtmltext(url):
    # Fetch a page and return its text, or None if the request fails.
    try:
        html = requests.get(url, headers=HEADERS, timeout=10)
        html.raise_for_status()
        html.encoding = html.apparent_encoding
        return html.text
    except requests.RequestException:
        return None

def parserhtml(html):
    # The book title lives in the <a ... title="..."> attribute and the
    # author/translator/publisher line in the following <p class="pl"> element;
    # re.S lets ".*?" match across line breaks between the two.
    if not html:
        return []
    List1 = re.findall(r'title="(.*?)".*?<p class="pl">(.*?)</p>', html, re.S)
    return List1

def savexlsx(List):
    # Append one "title,publication info" line per book to a local CSV file.
    with open(r"C:\Users\**\Desktop\top250.csv", "a", encoding="utf-8") as f:
        for i in List:
            f.write(i[0] + ',' + i[1] + '\n')

def main():
    # The list spans 10 pages of 25 books each (start = 0, 25, ..., 225).
    for i in range(10):
        url = "https://book.douban.com/top250?start=" + str(i * 25)
        html = getHtmltext(url)
        List = parserhtml(html)
        savexlsx(List)

if __name__ == '__main__':
    main()
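Because the publication-info line can itself contain commas, writing the fields by hand (as savexlsx does above) can break the CSV columns. A minimal alternative sketch using the standard-library csv module is shown below; the save_csv name is illustrative, and the output path simply reuses the one from the script above.

import csv

def save_csv(List, path=r"C:\Users\**\Desktop\top250.csv"):
    # newline='' avoids blank rows on Windows; csv.writer quotes any field
    # that contains a comma, so the publication info stays in one column.
    with open(path, "a", encoding="utf-8", newline='') as f:
        writer = csv.writer(f)
        for title, info in List:
            writer.writerow([title, info.strip()])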

2. requests + xpath
This version prints the results instead of saving them to a local file; a sketch for writing them to CSV follows the code.

import requests
from lxml import etree

HEADERS = {"User-Agent": "Mozilla/5.0"}  # Douban rejects the default python-requests User-Agent

for i in range(10):
    url = "https://book.douban.com/top250?start={}".format(i * 25)
    r = requests.get(url, headers=HEADERS).text
    data = etree.HTML(r)
    # Each book occupies its own <table> inside the content list.
    files = data.xpath('//*[@id="content"]/div/div[1]/div/table')
    for file in files:
        remake = file.xpath('./tr/td[2]/div[2]/span[2]/text()')  # rating score
        title = file.xpath('./tr/td[2]/div[1]/a/@title')         # book title
        href = file.xpath('./tr/td[2]/div[1]/a/@href')           # detail-page link
        scrible = file.xpath('./tr/td[2]/p[2]/span/text()')      # one-line blurb (not every book has one)
        num = file.xpath('./tr/td[2]/div[2]/span[3]/text()')     # rating count, e.g. "(285796人评价)"
        if len(scrible) > 0:
            print(title[0], href[0], remake[0], scrible[0], num[0].strip(')').strip("(").strip())
        else:
            print(title[0], href[0], remake[0], num[0].strip(')').strip("(").strip())
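To persist these results as well, the same csv-based approach from the first script works here. The sketch below is one way to do it under the same page-structure assumptions as the loop above: it collects the rows first and writes them once at the end (douban_top250.csv is just an illustrative filename).

import csv
import requests
from lxml import etree

rows = []
for i in range(10):
    url = "https://book.douban.com/top250?start={}".format(i * 25)
    r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}).text
    data = etree.HTML(r)
    for file in data.xpath('//*[@id="content"]/div/div[1]/div/table'):
        title = file.xpath('./tr/td[2]/div[1]/a/@title')
        href = file.xpath('./tr/td[2]/div[1]/a/@href')
        rating = file.xpath('./tr/td[2]/div[2]/span[2]/text()')
        # Take the first match when present, an empty string otherwise.
        rows.append([title[0] if title else '',
                     href[0] if href else '',
                     rating[0] if rating else ''])

with open("douban_top250.csv", "w", encoding="utf-8", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["title", "link", "rating"])
    writer.writerows(rows)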
