import requests
from bs4 import BeautifulSoup
import json
import re
import codecs
import time
# HTTP request headers passed to requests.get() by get_html.
headers = {
    # Placeholder value; presumably has to be replaced with a real
    # session cookie before running -- TODO confirm.
    'Cookie': 'xxxxxxxx',
    'Host': 'movie.douban.com',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:69.0) '
        'Gecko/20100101 Firefox/69.0'
    ),
}
def get_html(url, timeout=10):
    """Download *url* using the module-level ``headers`` and return its text.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely.

    Returns:
        The response body as text, or None when the request failed
        (connection error, timeout, or an HTTP error status). The failure
        is printed before returning.
    """
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of handing an error
        # page to the parsers downstream.
        r.raise_for_status()
    except requests.RequestException as e:
        print("失败服务器返回:%s" % e)
        return None
    return r.text
def get_movies(html):
    """Extract the ``data`` entry from a JSON response body.

    Args:
        html: JSON text as returned by the search endpoint, or None/empty
            when the download failed.

    Returns:
        The value stored under the top-level ``"data"`` key. An empty list
        is returned (and a message printed) when *html* is missing, is not
        valid JSON, is not a JSON object, or has no ``"data"`` entry, so
        callers can iterate over the result unconditionally.
    """
    if not html:
        print("No response body to parse; returning no movies")
        return []
    try:
        payload = json.loads(html)
    except (TypeError, ValueError) as e:
        # json.JSONDecodeError is a ValueError subclass.
        print("Response is not valid JSON: %s" % e)
        return []
    data = payload.get('data') if isinstance(payload, dict) else None
    if data is None:
        print("Response JSON has no 'data' entry; returning no movies")
        return []
    return data
def _extract_info_field(info_text, label):
    """Return the text following ``"<label>: "`` in *info_text*, with " / " replaced by ";"."""
    # '.' does not match newlines, so each match is the rest of one line.
    values = re.findall(label + r': (.*)', info_text)
    return "".join(values).replace(" / ", ";")


def _build_movie_record(movie, html):
    """Build the pipe-delimited output line for *movie* from its detail page.

    Returns None when *html* is empty/None or the page contains no
    ``<div id="info">`` element.
    """
    if not html:
        return None
    soup = BeautifulSoup(html, 'lxml')
    info_block = soup.find('div', attrs={'id': 'info'})
    if info_block is None:
        return None
    info_text = info_block.get_text()
    directed = _extract_info_field(info_text, '导演')
    star = _extract_info_field(info_text, '主演')
    language = _extract_info_field(info_text, '语言')
    release_date = _extract_info_field(info_text, '上映日期')
    genre = _extract_info_field(info_text, '类型')
    return "%s|%s|%s|%s|%s|%s|%s|%s|%s\n" % (
        movie['title'], movie['rate'], movie['url'], movie['cover'],
        genre, directed, star, language, release_date,
    )


def get_movie_info(movies):
    """Fetch each movie's detail page and append one record per movie.

    For every entry in *movies* the page at ``movie['url']`` is downloaded
    with ``get_html``, the director, cast, language, release date and genre
    lines are pulled out of its ``<div id="info">`` text, and a line of the
    form ``title|rate|url|cover|genre|director|cast|language|release\\n``
    is printed and written to the module-level ``file_object`` (which must
    already be open for writing).

    Movies whose page could not be downloaded or lacks the info block are
    reported and skipped instead of aborting the whole run.

    Args:
        movies: Iterable of dicts with ``title``, ``rate``, ``url`` and
            ``cover`` keys.
    """
    for movie in movies:
        line = _build_movie_record(movie, get_html(movie['url']))
        if line is None:
            print("Skipping %s: page unavailable or has no info block" % movie['url'])
        else:
            print(line)
            file_object.write(line)
        # Pause between detail-page requests, whether or not this one succeeded.
        time.sleep(1)
def movies_down(start,end):
    """Crawl search-result offsets from *start* up to (excluding) *end*.

    The offset advances in steps of 20. For each offset the search URL is
    fetched with ``get_html``, decoded with ``get_movies``, and the
    resulting movies are handed to ``get_movie_info``.
    """
    search_url = 'https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=%E7%94%B5%E5%BD%B1&start='
    for offset in range(start, end, 20):
        listing = get_html(search_url + str(offset))
        get_movie_info(get_movies(listing))
# Run the crawl for offsets 700-1300, appending UTF-8 records to
# douban_movies.txt. The with-block closes the file even if the crawl
# raises; file_object stays a module-level name because get_movie_info
# writes through it.
with codecs.open('douban_movies.txt', 'a', 'utf-8') as file_object:
    movies_down(700, 1300)