# 爬取起点玄幻的简单信息~~纯属娱乐(新手上路,太难的也爬不到)
# (Scrapes basic info for Qidian fantasy novels — just for fun; beginner script.)

import requests
from lxml import etree
# Browser-like User-Agent so qidian.com serves normal pages instead of
# blocking the default requests/python UA.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
}
def parse_url(url):
    """Fetch one Qidian book detail page and extract its basic info.

    Args:
        url: Absolute URL of a book's detail page on qidian.com.

    Returns:
        dict with keys "book_name", "book_author", "book_info", "book_url",
        plus the legacy misspelled key "book_atuhor" kept for backward
        compatibility with existing callers.
    """
    # timeout prevents the crawler from hanging forever on a stalled server.
    response = requests.get(url=url, headers=HEADERS, timeout=10)
    html = etree.HTML(response.text)

    # xpath() returns a list; guard against page-layout changes so a missing
    # node yields an empty string instead of an IndexError.
    names = html.xpath("//div[@class='book-info ']/h1/em/text()")
    authors = html.xpath("//div[@class='book-info ']/h1/span/a/text()")
    intro_parts = html.xpath("//div[@class='book-intro']/p/text()")

    book_author = authors[0] if authors else ""
    book_details = {
        "book_name": names[0] if names else "",
        "book_author": book_author,
        "book_atuhor": book_author,  # legacy misspelled key — do not remove
        # join the stripped intro paragraphs in one pass (avoids quadratic +=)
        "book_info": "".join(part.strip() for part in intro_parts),
        "book_url": url,
    }
    return book_details

def get_page_urls(url):
    """Fetch one listing page and print the details of every book on it.

    Args:
        url: URL of a qidian.com "all books" listing page.
    """
    # timeout prevents the crawler from hanging forever on a stalled server.
    response = requests.get(url=url, headers=HEADERS, timeout=10)
    html = etree.HTML(response.text)
    # print(etree.tostring(html, encoding="utf-8").decode("utf-8"))

    # Each book card links to its detail page; hrefs are protocol-relative
    # ("//book.qidian.com/..."), so prepend the scheme before fetching.
    hrefs = html.xpath("//div[@class='book-mid-info']/h4/a/@href")
    for href in hrefs:
        book_details = parse_url("https:" + href)  # per-book detail dict
        print(book_details)
def splider():
    """Walk the first nine pages of Qidian's "all books" listing and
    crawl every book found on each page."""
    page_template = (
        "https://www.qidian.com/all?orderId=&style=1&pageSize=20"
        "&siteid=1&pubflag=0&hiddenField=0&page={}"
    )
    for page_number in range(1, 10):
        get_page_urls(page_template.format(page_number))


if __name__ == "__main__":
    # Run the crawler only when executed as a script, not on import.
    splider()

# 你可能感兴趣的:(爬虫)  -- blog-page residue, kept as a comment so the file parses