利用 Python 正则表达式爬取豆瓣 TOP250 排行榜

话不多说,直接贴上代码:
import requests
import re
import json
# Pattern for one movie entry on a Top 250 listing page.
# NOTE(review): the published snippet lost its HTML tags, quotes and most "*"
# characters, so this pattern is a reconstruction that keeps the same four
# capture groups in the same order -- confirm it against the live page markup.
_MOVIE_PATTERN = re.compile(
    r'<li>.*?<div class="item">.*?'
    r'<span class="title">(.*?)</span>.*?'            # group 1: movie title
    r'<div class="bd">.*?<p[^>]*>(.*?)<br\s*/?>'      # group 2: director/cast line
    r'.*?(\d{4}).*?'                                  # group 3: release year
    r'<span class="rating_num"[^>]*>(.*?)</span>',    # group 4: rating
    re.S,  # let "." span newlines; each entry covers many lines of markup
)

# NOTE(review): a browser-like User-Agent is sent on the assumption that the
# site rejects the default python-requests one -- confirm.
_REQUEST_HEADERS = {"User-Agent": "Mozilla/5.0"}


def pase_detail(url, html=None):
    """Yield one dict per movie found on a listing page.

    Args:
        url: Address of the listing page to download.
        html: Optional page source that has already been fetched. When it is
            given, ``url`` is not requested and this text is parsed instead.

    Yields:
        dict with the string values ``movie_name``, ``movie_direct``,
        ``movie_year`` and ``movie_score``.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    if html is None:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, headers=_REQUEST_HEADERS, timeout=10)
        # Fail loudly instead of silently parsing an error page into nothing.
        response.raise_for_status()
        html = response.content.decode()

    for name, director_raw, year, score in _MOVIE_PATTERN.findall(html):
        director = director_raw.replace("\n", "").strip()
        # The director and cast parts are separated by a three-character gap.
        # NOTE(review): the snippet shows three plain spaces; the entity form
        # is handled as well in case the raw page uses &nbsp; -- confirm.
        for gap in ("&nbsp;&nbsp;&nbsp;", "   "):
            director = director.replace(gap, "--")
        yield {
            "movie_name": name,
            "movie_direct": director,
            "movie_year": year,
            "movie_score": score,
        }
    
  • #定义一个函数用于获取后的文本存储
    def write_content(movie_info):
    with open(‘douban_top250.txt’,‘a+’,encoding=‘utf-8’) as fp:
    fp.write(json.dumps(movie_info,ensure_ascii=False)+’\n’)

    if name==“main”:
    url = ‘https://movie.douban.com/top250?start={}&filter=’
    for page in range(11):
    for movie_msg in pase_detail(url.format(page*25)):
    print(movie_msg)
    write_content(movie_msg)

    你可能感兴趣的:(爬虫)