爬取中华诗词网所有诗词

今天给大家分享一个爬取中华诗词网所有诗词的 Python 爬虫。本项目主要使用了 pyquery 库和 requests 库(其实只用 pyquery 就可实现)。废话不多说,直接上代码:

from time import sleep

import requests
from pyquery import PyQuery as pq

# Desktop-Chrome User-Agent so the site serves normal HTML instead of
# blocking or degrading the response for an unidentified client.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
}
# Site root; the scraped relative hrefs are joined onto this prefix.
base_url = "https://www.shi-ci.com"
def get_index_page(url):
    """Fetch the site index page and return absolute author-page URLs.

    Scrapes the first child link of every ``<li>`` element on *url*.
    Returns a list of absolute URLs; on a non-200 response returns an
    empty list (the original returned ``None`` here, which made the
    caller's ``for url in urls`` loop raise TypeError).
    """
    urls = []
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        # Empty list keeps callers iterable-safe on HTTP failure.
        return urls
    doc = pq(response.text)
    for item in doc("li").items():
        # attr() yields None when the child has no href; str(None) is
        # "None" (len 4), which the length guard below filters out
        # along with other too-short junk links.
        href = str(item.children().attr("href"))
        if len(href) > 5:
            urls.append(base_url + "/" + href)
    return urls

def get_author_page(url):
    """Fetch an author page and return absolute poem-list URLs.

    Scrapes the first child link of every ``.poem-preview`` element.
    Returns a list of absolute URLs; on a non-200 response returns an
    empty list (the original returned ``None``, crashing the caller's
    ``for author_url in author_urls`` loop).
    """
    urls = []
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        # Empty list keeps callers iterable-safe on HTTP failure.
        return urls
    doc = pq(response.text)
    for item in doc(".poem-preview").items():
        # str(None) == "None" (len 4) is rejected by the length guard.
        href = str(item.children().attr("href"))
        if len(href) > 5:
            urls.append(base_url + "/" + href)
    return urls

def get_poem_list(url):
    """Fetch a poem-list page and return absolute poem-page URLs.

    Identical scraping logic to :func:`get_author_page` (same
    ``.poem-preview`` selector); kept as a separate function to
    preserve the public interface. Returns an empty list on a non-200
    response instead of the original ``None``, so callers can iterate
    the result unconditionally.
    """
    urls = []
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        # Empty list keeps callers iterable-safe on HTTP failure.
        return urls
    doc = pq(response.text)
    for item in doc(".poem-preview").items():
        # str(None) == "None" (len 4) is rejected by the length guard.
        href = str(item.children().attr("href"))
        if len(href) > 5:
            urls.append(base_url + "/" + href)
    return urls

# def get_poem_page(url):
#     response = requests.get(url, headers=headers)
#     if response.status_code == 200:
#         doc = pq(response.text)
#         title = doc("#poem>h1").text()
#         year = doc("#poem>h3").text()
#         content = doc("#poem>div").text()
#         try:
#             with open("poems.csv","a",encoding="utf-8") as csvf:
#                 filenames = ["title","year","content"]
#                 writer = csv.DictWriter(csvf,filenames,delimiter=" ")
#                 writer.writerow({"title": title,"year": year,"content": content})
#                 print("%ssuccess"%title)
#         except ValueError as e:
#             print(e)
#     return None
def get_poem_page(url):
    """Download a single poem page and append it to poems.txt.

    Extracts the title (``#poem>h1``), year line (``#poem>h3``) and
    body (``#poem>div``), then appends them to ``poems.txt`` separated
    by a dashed divider. Silently does nothing on a non-200 response.
    Always returns None.
    """
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        return None
    doc = pq(response.text)
    title = doc("#poem>h1").text()
    year = doc("#poem>h3").text()
    content = doc("#poem>div").text()
    record = "\n".join([title, year, content, "-------------------"]) + "\n"
    with open("poems.txt", "a", encoding="utf-8") as out:
        out.write(record)
        print("%ssuccess" % title)
    return None
if __name__ == "__main__":
    # Full crawl: index -> author pages -> per-author poem lists -> poems.
    for index_url in get_index_page(base_url):
        for author_url in get_author_page(index_url):
            # Throttle once per author list to be polite to the server.
            sleep(0.5)
            for poem_url in get_poem_list(author_url):
                get_poem_page(poem_url)

喜欢就留下你的小心心吧!

你可能感兴趣的:(爬取中华诗词网所有诗词)