第一个小爬虫代码

# -*- coding: utf-8 -*-
import urllib
import urllib.request
from urllib import parse

from bs4 import BeautifulSoup

# Build the Douban tag-page URL; the tag is percent-encoded because it is non-ASCII.
url = 'https://www.douban.com/tag/{}/?focus=book'.format(parse.quote('小说'))

# Approach 1: urllib + BeautifulSoup.
# The context manager closes the HTTP response once parsing is done, and the
# timeout keeps the script from hanging forever on an unresponsive server.
with urllib.request.urlopen(url, timeout=10) as res:
    soup = BeautifulSoup(res, "html.parser")

# find() returns None when nothing matches, so fall back to an empty list
# instead of raising AttributeError on a page without an id="book" element.
book_div = soup.find(attrs={"id": "book"})
book_a = book_div.find_all(attrs={"class": "title"}) if book_div is not None else []
for book in book_a:
    print(book.string)
    

import requests
from lxml import etree

# Approach 2: requests + lxml XPath, fetching the same `url` built earlier in this script.
# A timeout keeps the script from hanging forever on an unresponsive server.
res = requests.get(url, timeout=10)
# Fail loudly on HTTP 4xx/5xx rather than silently parsing an error page.
res.raise_for_status()
root = etree.HTML(res.content)
# Collect the direct text nodes of every class="title" element under the id="book" element.
book_a = root.xpath("//*[@id = 'book']//*[@class = 'title']/text()")
print(book_a)

分别用 urllib + BeautifulSoup 和 requests + XPath（lxml）两种方法爬取并解析网页内容。

你可能感兴趣的:(第一个小爬虫代码)