PyCharm 2021.1.2 x64
爬取的目标网页
import csv

import requests
from bs4 import BeautifulSoup
# Scrape the all-time online-ticketing box-office ranking table and save it
# as a CSV file: one header row taken from <thead>, then one row per <tr>
# found in <tbody>.
url = "http://58921.com/alltime/wangpiao"  # target page

# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() stops us from parsing an HTTP error page as data.
response = requests.get(url, timeout=10)
response.raise_for_status()
response.encoding = "utf-8"
text = response.text

bs = BeautifulSoup(text, 'lxml')
table = bs.find('table', attrs={'class': 'center_table table table-bordered table-condensed'})
if table is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise RuntimeError("ranking table not found; the page layout may have changed")

thead = table.find('thead')
tbody = table.find('tbody')
if thead is None or tbody is None:
    raise RuntimeError("ranking table has no <thead>/<tbody>; the page layout may have changed")

ths = thead.find_all('th')
trs = tbody.find_all('tr')

# The `with` block guarantees the file is flushed and closed even if a row
# fails to write.  csv.writer quotes cells that contain commas, quotes or
# line breaks, so such cells no longer corrupt the column layout, and rows
# no longer end with a stray trailing comma.  newline="" is what the csv
# module requires of the file object it writes to.
with open('中国电影票房.csv', mode="w", encoding="UTF-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow([th.text for th in ths])  # header row
    for tr in trs:
        # Only the cell's text is exported; a cell whose value the page
        # renders as an <img> has no text and therefore comes out empty.
        writer.writerow([td.text for td in tr.find_all("td")])
C3没有数据
原因是：在网页源代码中，这一数据是通过 img 标签（png 格式图片）来显示的，而不是以网页文本的形式显示的。我的想法是利用 Python 文字识别（OCR）技术来识别这张图片（识别中文需额外下载中文语言包），之后再写入 csv 文件。
目前还在努力实现中…