目标网站:电影票房网
目标网址:http://58921.com/daily/wangpiao
目标数据:(1)名次(2)电影名称 (3)日期(4)票房 (5)总场次(6)废场(7)人次(8)上座率(9)票价
(1)使用urllib或requests库实现该网站网页源代码的获取,并将源代码进行保存;
(2)自主选择re、bs4、lxml中的一种解析方法对保存的源代码读取并进行解析,成功找到目标数据所在的特定标签,进行网页结构的解析;
(3)定义函数,将获取的目标数据保存到csv文件中。
(4)使用框架式结构,通过参数传递实现整个特定数据的爬取。
import requests
import json
import csv
from requests.exceptions import RequestException
from lxml import etree
def getHtmlText(url):
    """Fetch the HTML source of *url* and return it as text.

    Returns "" on any network/HTTP failure so the caller can continue.
    """
    headers = {
        # Pretend to be a desktop browser; sites like this block default UAs.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36 Edg/80.0.361.69'
    }
    try:
        result = requests.get(url, headers=headers, timeout=30)
        result.raise_for_status()
        # Guess the real encoding from the response body, not just headers,
        # so Chinese text decodes correctly.
        result.encoding = result.apparent_encoding
        return result.text
    except RequestException:
        # Narrow catch (was a bare except): only network/HTTP errors,
        # raise_for_status() raises HTTPError, a RequestException subclass.
        return ""
def parsePage(html):
    """Parse the box-office table out of *html*.

    Each table row has 8 text cells plus one image cell: the site renders
    the box-office figure as a picture to defeat scraping, so the images
    are downloaded to <n>.png, OCR'd with pytesseract, and appended as
    the 9th field of each row.

    Returns a list of 9-element rows:
    [名次, 电影名称, 日期, 总场次, 废场, 人次, 上座率, 票价, 票房(OCR)]
    """
    # These were used but never imported in the original (NameError at
    # runtime). Imported lazily here so they are only required when the
    # OCR path actually runs.
    import pytesseract
    from PIL import Image

    tree = etree.HTML(html, etree.HTMLParser())
    # All visible text of every table cell (flat list).
    cells = tree.xpath('//*[@id="content"]/div[2]/table/tbody/tr/td//text()')
    # URLs of the box-office images. NOTE: the original passed
    # stream=True here, but lxml's xpath() has no such keyword -- it was
    # silently treated as an unused XPath variable, so it is dropped.
    imgs = tree.xpath('//*[@id="content"]/div[2]/table/tbody/tr/td/a/img/@src')

    # Download each image and OCR it.
    ilist = []
    for idx, img_url in enumerate(imgs, start=1):
        fname = str(idx) + '.png'
        with open(fname, 'wb') as fd:
            fd.write(requests.get(img_url, timeout=30).content)
        text = pytesseract.image_to_string(Image.open(fname))
        # Clean up common OCR misreads: a space is really the decimal
        # point; 'M' and 'a' are misreads of the character 亿.
        ilist.append(text.replace(" ", ".").replace("M", "亿").replace("a", "亿"))

    # Strip whitespace noise, then drop cells that became empty.
    cleaned = [c.replace(" ", "").replace('\r', "").replace("\n", '') for c in cells]
    ulist = [c for c in cleaned if c]

    # Re-group the flat cell list into rows of 8 text fields, appending
    # the OCR'd box-office figure as the 9th column.
    rlist = []
    for row in range(len(ulist) // 8):
        record = ulist[row * 8:(row + 1) * 8]
        record.append(ilist[row])
        rlist.append(record)
    return rlist
# def txtdata(data):
# with open('top20.txt','w')as file:
# for i in data:
# for j in i:
# print(j)
# print('successful')
def storedata(data):
    """Persist *data* to top20.txt, one JSON document per line."""
    json_lines = [json.dumps(row, ensure_ascii=False) + '\n' for row in data]
    with open('top20.txt', 'w', encoding='utf-8') as out:
        out.writelines(json_lines)
    print('ok')
def csvdata(data):
    """Write the scraped rows to top20.csv with Chinese column headers.

    Each input row is expected to hold 9 fields in parsePage order; the
    OCR'd box-office figure sits at index 8 and is moved into the 票房
    column.
    """
    header = ['名次', '电影名称', '日期', '票房', '总场次', '废场', '人次', '上座率', '票价(元)']
    with open('top20.csv', 'w', encoding='utf-8', newline='') as out:
        writer = csv.writer(out)
        writer.writerow(header)
        for row in data:
            # Reorder so column order matches the header above.
            writer.writerow([row[0], row[1], row[2], row[8],
                             row[3], row[4], row[5], row[6], row[7]])
    print('ok')
def main():
    """Crawl the daily box-office page and persist the results.

    Pipeline: fetch HTML -> parse/OCR rows -> write JSON-lines and CSV.
    """
    url = "http://58921.com/daily/wangpiao"
    html = getHtmlText(url)
    rlist = parsePage(html)
    storedata(rlist)  # JSON-lines dump (top20.txt)
    csvdata(rlist)    # spreadsheet-friendly dump (top20.csv)


if __name__ == '__main__':
    # Entry guard (was a bare call): importing this module no longer
    # triggers a network crawl.
    main()