# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
import sys
import pandas as pd
import re
import requests
def sgw():
s=requests.session()
headers={
'Accept':'*/*',
'Accept-Encoding':'gzip, deflate',
'Accept-Language':'zh-CN,zh;q=0.9',
'Connection':'keep-alive',
'Host':'movie.mtime.com',
'Referer':'http://movie.mtime.com/boxoffice/',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.15 Safari/537.36',
'X-Requested-With':'XMLHttpRequest',
}
s.headers.update(headers)
df = pd.DataFrame(columns=('排名', '电影', '类型', '首日票房(元)', '年度票房(元)', '上映日期'))
x=0
for i in range(10):
url='http://movie.mtime.com/boxoffice/?year=2018&area=china&type=MovieRankingYear&category=all&page={}&display=table×tamp=1547015331595&version=07bb781100018dd58eafc3b35d42686804c6df8d&dataType=json'.format(str(i))
req=s.get(url=url,verify=False).text
bs = BeautifulSoup(req, 'lxml')
tr=bs.find_all('tr')
for j in tr[1:]:
td=j.find_all('td')
list=[]
for k in range(6):
if k==1:
nm=td[k].find('a').text
print(td[k].a.string)
list.append(nm)
else:
list.append(td[k].text)
df.loc[x] = list
x=x+1
print(df)
df.to_excel('时光网.xlsx', index=False, encoding="GB18030")
sgw()