Python爬虫实战(一):爬取豆瓣电影top250排名

先上代码


#coding=utf-8
import re
import urllib.request

def getHtml(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    The response is opened in a ``with`` block so the underlying
    connection is closed even when reading or decoding fails.

    Args:
        url: Address of the page to download.

    Returns:
        The page content as a ``str``.
    """
    with urllib.request.urlopen(url) as page:
        raw = page.read()
    return raw.decode('utf-8')

# NOTE(review): the HTML tags inside this pattern were lost when the article
# was extracted; they are reconstructed here from the markup of Douban's
# Top 250 list page -- confirm against the live page before relying on it.
# Groups: (1) title, (2) release year, (3) rating, (4) number of ratings.
ITEM_PATTERN = re.compile(
    r'<span class="title">(.*?)</span>.*?'
    r'<p class="">.*?<br>.*?(\d+).*?</p>.*?'
    r'<span class="rating_num".*?>(.*?)</span>.*?'
    r'<span>(\d+)人评价',
    re.S,
)

# Rank of the last movie printed so far; lets numbering continue across pages.
index = 0


def getItem(html):
    """Print every movie found in *html*, numbered continuously across calls.

    The loop variable is the module-level ``index``, so after the call it
    holds the rank of the last movie printed and the next call resumes
    from there.

    Args:
        html: Source of one Top 250 list page.
    """
    global index
    items = re.findall(ITEM_PATTERN, html)
    for index, item in enumerate(items, index + 1):
        print(index, item)


if __name__ == '__main__':
    index = 0
    # The list is paginated 25 movies per page: start=0, 25, ..., 225.
    for i in range(0, 226, 25):
        url = "https://movie.douban.com/top250?start="
        url += str(i) + "&filter="
        html = getHtml(url)
        getItem(html)
    print("\nOK!All OVER!")


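#关于正则表达式的一些说明(原文中的 HTML 标签在转载时丢失,以下按豆瓣 Top250 页面结构补全)
#<span class="title">(.*?)</span>  获取电影名字
#<br>.*?(\d+) 获取电影上映年份
#<span class="rating_num".*?>(.*?)</span> 获取评分
#<span>(\d+)人评价  获取评价人数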


如果想要代码看起来更优雅点,可以去掉

global index

index = 0

再将 index+1 改成 1,只是这样每一页的序号都会从 1 重新开始,不能跨页连续编号了。


#coding=utf-8
import re
import urllib.request

def getHtml(url):
    """Download *url* and return its body as UTF-8 decoded text.

    Using the response as a context manager guarantees the connection is
    released even if ``read()`` or ``decode()`` raises.

    Args:
        url: Address of the page to download.

    Returns:
        The page content as a ``str``.
    """
    with urllib.request.urlopen(url) as page:
        raw = page.read()
    return raw.decode('utf-8')

# NOTE(review): the HTML tags inside this pattern were lost when the article
# was extracted; they are reconstructed here from the markup of Douban's
# Top 250 list page -- confirm against the live page before relying on it.
# Groups: (1) title, (2) release year, (3) rating, (4) number of ratings.
ITEM_PATTERN = re.compile(
    r'<span class="title">(.*?)</span>.*?'
    r'<p class="">.*?<br>.*?(\d+).*?</p>.*?'
    r'<span class="rating_num".*?>(.*?)</span>.*?'
    r'<span>(\d+)人评价',
    re.S,
)


def getItem(html):
    """Print every movie found in *html*, numbered from 1.

    Numbering restarts at 1 on every call, i.e. on every page.

    Args:
        html: Source of one Top 250 list page.
    """
    items = re.findall(ITEM_PATTERN, html)
    for index, item in enumerate(items, 1):
        print(index, item)


if __name__ == '__main__':
    # The list is paginated 25 movies per page: start=0, 25, ..., 225.
    for i in range(0, 226, 25):
        url = "https://movie.douban.com/top250?start="
        url += str(i) + "&filter="
        html = getHtml(url)
        getItem(html)
    print("\nOK!All OVER!")



#coding=utf-8
import re
import requests
from prettytable import PrettyTable
from colorama import init,Fore

def getHtml(url):
    """Fetch *url* with ``requests`` and return the decoded response text.

    Args:
        url: Address of the page to download.

    Returns:
        The response body as a ``str``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    page = requests.get(url)
    # Fail loudly on an error status instead of handing an error page to
    # the parser, where it would simply produce no matches.
    page.raise_for_status()
    return page.text

# NOTE(review): the HTML tags inside this pattern were lost when the article
# was extracted; they are reconstructed here from the markup of Douban's
# Top 250 list page -- confirm against the live page before relying on it.
# Groups: (1) title, (2) release year, (3) rating, (4) number of ratings.
ITEM_PATTERN = re.compile(
    r'<span class="title">(.*?)</span>.*?'
    r'<p class="">.*?<br>.*?(\d+).*?</p>.*?'
    r'<span class="rating_num".*?>(.*?)</span>.*?'
    r'<span>(\d+)人评价',
    re.S,
)


def getItem(html):
    """Return the movies found in *html*.

    Args:
        html: Source of one Top 250 list page.

    Returns:
        A list of ``(title, year, rating, rating_count)`` string tuples,
        in page order; empty when nothing matches.
    """
    return re.findall(ITEM_PATTERN, html)


if __name__ == '__main__':
    movie = []
    init(autoreset=True)
    table = PrettyTable([Fore.RED + "排名", "电影名", '上映年份', '综合评分', '评价人数'])
    # The list is paginated 25 movies per page: start=0, 25, ..., 225.
    for i in range(0, 226, 25):
        url = "https://movie.douban.com/top250?start="
        url += str(i) + "&filter="
        html = getHtml(url)
        movie.extend(getItem(html))
    # Rank colours cycle every four rows; the tuple is indexed by rank % 4.
    colors = (Fore.RED, Fore.YELLOW, Fore.GREEN, Fore.CYAN)
    for index, item in enumerate(movie, 1):
        color = colors[index % 4]
        table.add_row([color + str(index), item[0], item[1], item[2], item[3]])
    print(table)


你可能感兴趣的:(Python,爬虫)