先上代码
#coding=utf-8
import re
import urllib.request
def getHtml(url):
    """Download *url* and return the response body decoded as UTF-8 text.

    The response is used as a context manager so that it is closed as soon
    as the body has been read, instead of being left open.

    Raises whatever ``urllib.request.urlopen`` raises for a failed request,
    and ``UnicodeDecodeError`` if the body is not valid UTF-8.
    """
    with urllib.request.urlopen(url) as page:
        raw = page.read()
    return raw.decode('utf-8')
def getItem(html):
    """Print every movie found in *html*, numbered with a running rank.

    Each match is a tuple of strings ``(title, year, rating, votes)``.
    The rank continues from the module-level ``index`` (which the caller
    must initialise, e.g. to 0) and ``index`` is left at the last rank
    printed, so consecutive pages are numbered continuously.

    NOTE(review): the HTML tags inside the published pattern were stripped,
    leaving an unterminated string with no rating group. The anchors below
    are reconstructed from the Douban Top250 page markup -- confirm them
    against the live page.
    """
    global index
    reg = re.compile(
        r'<span class="title">(.*?)</span>'                # first title span
        r'.*?<br\s*/?>\s*(\d+)'                            # year after the <br>
        r'.*?<span class="rating_num"[^>]*>(.*?)</span>'   # rating
        r'.*?<span>(\d+)人评价',                            # number of votes
        re.S,  # let "." cross line breaks inside one entry
    )
    items = reg.findall(html)
    for index, item in enumerate(items, index + 1):
        print(index, item)
if __name__ == '__main__':
    # Running rank shared with getItem() through its ``global index``;
    # starting at 0 makes the first printed entry number 1.
    index = 0
    # The ``start`` query parameter advances in steps of 25: 0, 25, ..., 225.
    for i in range(0, 226, 25):
        url = "https://movie.douban.com/top250?start=" + str(i) + "&filter="
        getItem(getHtml(url))
    print("\nOK!All OVER!")
#关于正则表达式的一些说明
#(注意:文章发布时正则里的 HTML 标签被过滤掉了,下面只剩下各个捕获组)
#(.*?) 获取电影名字
#.*?(\d+) 获取电影上映年份
#(评分对应的正则片段在发布时丢失) 获取评分
#.*?(\d+)人评价 获取评价人数
如果想让代码看起来更优雅一点,可以去掉下面这两行:
global index
index = 0
再将 enumerate(items,index+1) 中的 index+1 改成 1。只是这样每一页的序号都会从 1 重新开始,不能跨页连续编号了。
#coding=utf-8
import re
import urllib.request
def getHtml(url):
    """Fetch the page at *url* and return its body as a UTF-8 decoded string.

    The response is opened in a ``with`` block so it is closed once the
    bytes have been read.

    Raises the exceptions of ``urllib.request.urlopen`` when the request
    fails, and ``UnicodeDecodeError`` when the body is not valid UTF-8.
    """
    with urllib.request.urlopen(url) as response:
        body = response.read()
    return body.decode('utf-8')
def getItem(html):
    """Print every movie found in *html*, numbered from 1 on each call.

    Each match is a tuple of strings ``(title, year, rating, votes)``.
    Because the numbering restarts for every page, ranks are not continuous
    across calls.

    NOTE(review): the HTML tags inside the published pattern were stripped,
    leaving an unterminated string with no rating group. The anchors below
    are reconstructed from the Douban Top250 page markup -- confirm them
    against the live page.
    """
    reg = re.compile(
        r'<span class="title">(.*?)</span>'                # first title span
        r'.*?<br\s*/?>\s*(\d+)'                            # year after the <br>
        r'.*?<span class="rating_num"[^>]*>(.*?)</span>'   # rating
        r'.*?<span>(\d+)人评价',                            # number of votes
        re.S,  # let "." cross line breaks inside one entry
    )
    for index, item in enumerate(reg.findall(html), 1):
        print(index, item)
if __name__ == '__main__':
    # The ``start`` query parameter advances in steps of 25: 0, 25, ..., 225.
    for i in range(0, 226, 25):
        url = "https://movie.douban.com/top250?start=" + str(i) + "&filter="
        # getItem() prints the entries of this page, numbered from 1 again.
        getItem(getHtml(url))
    print("\nOK!All OVER!")
#coding=utf-8
import re
import requests
from prettytable import PrettyTable
from colorama import init,Fore
def getHtml(url, timeout=10):
    """Fetch *url* with ``requests`` and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. The
            original call passed no timeout, so a stalled connection could
            block the script indefinitely.

    Returns:
        The decoded body (``Response.text``).

    Raises:
        requests.exceptions.RequestException: if the request fails or
            times out.
    """
    page = requests.get(url, timeout=timeout)
    return page.text
def getItem(html):
    """Return the movies found in *html* as a list of string tuples.

    Each tuple is ``(title, year, rating, votes)``; the caller indexes the
    four fields as ``item[0]`` .. ``item[3]``. An input without matches
    yields an empty list.

    NOTE(review): the HTML tags inside the published pattern were stripped,
    leaving an unterminated string with only three groups. The anchors below
    are reconstructed from the Douban Top250 page markup -- confirm them
    against the live page.
    """
    reg = re.compile(
        r'<span class="title">(.*?)</span>'                # first title span
        r'.*?<br\s*/?>\s*(\d+)'                            # year after the <br>
        r'.*?<span class="rating_num"[^>]*>(.*?)</span>'   # rating
        r'.*?<span>(\d+)人评价',                            # number of votes
        re.S,  # let "." cross line breaks inside one entry
    )
    return reg.findall(html)
if __name__ == '__main__':
    init(autoreset=True)
    table = PrettyTable([Fore.RED + "排名", "电影名", '上映年份', '综合评分', '评价人数'])

    # Collect the entries of every page first; ``start`` advances in steps
    # of 25: 0, 25, ..., 225.
    movie = []
    for i in range(0, 226, 25):
        url = "https://movie.douban.com/top250?start=" + str(i) + "&filter="
        movie.extend(getItem(getHtml(url)))

    # Colour of the rank cell, selected by ``index % 4``:
    # 0 -> red, 1 -> yellow, 2 -> green, 3 -> cyan.
    palette = (Fore.RED, Fore.YELLOW, Fore.GREEN, Fore.CYAN)
    for index, item in enumerate(movie, 1):
        color = palette[index % 4]
        table.add_row([color + str(index), item[0], item[1], item[2], item[3]])
    print(table)