I have only just started with web scraping. Modelled on code I wrote earlier, the script below crawls movie information from the 80s site; the page being crawled is https://www.80s.tw/hot.
Libraries used

import re                              # regular expressions
import urllib.request, urllib.error    # build the request and fetch the page
from bs4 import BeautifulSoup          # parse the HTML

re and urllib ship with Python; BeautifulSoup is a third-party package and can be installed with pip install beautifulsoup4.
Scraper code
# Import the required modules
import re                              # regular expressions
import urllib.request, urllib.error    # build the request and fetch the page
from bs4 import BeautifulSoup          # parse the HTML
baseurl = 'https://www.80s.tw/hot'
# Regular expressions describing the pieces we want from each list item.
# The exact patterns below are assumptions (they could not be verified against the
# page's real HTML); they are written to match how their capture groups are indexed
# later on: titlelink -> group 1 = relative detail-page link, group 2 = movie title;
# imglink -> group 3 = protocol-relative poster URL, later prefixed with "https:".
imglink = re.compile(r'<img.*?alt="(.*?)".*?title="(.*?)".*?src="(//.*?)"', re.S)
titlelink = re.compile(r'<a href="(.*?)".*?>(.*?)</a>', re.S)
# 1. Crawl the listing page
def getData():
    # 2. Parse the data into five parallel lists
    img = []        # poster image URLs
    src = []        # detail-page URLs
    title = []      # movie titles
    fens = []       # ratings
    contents = []   # plot synopses
    html = askURL(baseurl)
    bs = BeautifulSoup(html, "html.parser")
    for item in bs.find_all('div', class_="lpelmt2 me2li"):
        item = str(item)
        titlel = re.findall(titlelink, item)
        for t in titlel:
            title.append(t[1])
            print(t[1])
            tsrc = "https://www.80s.tw" + t[0]
            fen, content = getContentAndFen(tsrc)
            # fen, content = "6", "2"   # stub for testing without fetching every detail page
            fens.append(fen)
            contents.append(content)
            src.append(tsrc)
            print(fen, content)
        imgl = re.findall(imglink, item)
        for i in imgl:
            img.append("https:" + i[2])
    return title, img, src, fens, contents
# Fetch the HTML of a single URL
def askURL(url):
    # Pretend to be a normal browser; without these headers the site may block the request
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36",
        "Cookie": "BAIDU_SSP_lcr=https://www.baidu.com/link?url=HMnQR6d-rPO0YlyHtrIM7E4dn4YUvW6Vm1bNsMLt4WO&wd=&eqid=e3e4166c0000b93600000003603caae8; Hm_lvt_caa88905362b7957005130b28e579d36=1614588653; _ga=GA1.2.617854400.1614588655; _gid=GA1.2.1808945187.1614588655; beitouviews_3758=OUHKI5ksCimBxsKCLklg%252BlwvUZh1FuJ6Vyi9m6XmS6eaAV9W6jgPS14FvCyFS4GHUf3YfgIhBBj5A%252FQLXbdsgSrgYpyHGtzo%252BLBHH0vHJdqh8jvMZEDRH%252FSbbFZITKsr5ErvsUY2Ao%252B5ID8ZFZIeOtAU%252F%252F6wFTelIC3oCspNs%252BbSHJcV2GtqrjikD4mrMGEkdsd3tL0z9v6mHtZh8cPS48AvWQtlpbvQi%252F6jyNUEP1ziCm9fHUmufiDHQEPZNMx0LXzlQATlHuRiScjiXziIgn9w%252BXqCyODFwuwkhDsdEmE1W%252FpFNiIfS9FE1Om0jr22Ig5Ybaavihtfb4NPt89qtQ%253D%253D; 3758_2470_111.36.138.122=1; richviews_3760=tNiZFpEMqXWe%252BFIoHRJd6y6X7RfaTIM3payNSGO2qHjxpAF9DWNOhKKdRJppp4O4V5EHhtbdcrsdgMHtJ04HLqx%252B94djknSuo1i%252B4mFZgv1hOId%252FB49VuDfByAxn5GkjahAWEq3XZww2iosVDdJQtudDjU5V%252BZH17hqG%252FQQB0XHUTOpmaLSMwQB8uoBynw%252F3xAd0ZnPNenng5MOlP2jZBh4%252Fnyan4yKv1zE33NWayTbIyXKnk1NVN1xaiKlRWO6r2Xo9b71Uk97wu9TAG9qJ54szIm90ke%252BDsPoBO1M3ZjeLBgPwN%252F9djQV6daKpCeJjPJqkY2tzbrxnKvddMmFJ1Q%253D%253D; 3760_2444_111.36.138.122=1; Hm_lpvt_caa88905362b7957005130b28e579d36=1614588658"
    }
    req = urllib.request.Request(url=url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(req)
        html = response.read()
    except Exception as result:
        print(result)
    return html
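askURL returns the raw response bytes (or an empty string if the request failed), and BeautifulSoup accepts bytes directly. For a quick manual check you can decode the bytes yourself; the snippet below is only an illustrative test, not part of the scraper, and assumes the site is still reachable.

# Quick manual check of askURL (illustrative only)
page = askURL(baseurl)
if page:
    print(page.decode("utf-8", errors="ignore")[:200])   # response.read() gives bytes, so decode before printing
else:
    print("request failed: check the URL, headers, or your network")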
# Fetch a movie's detail page and pull out its rating and synopsis
def getContentAndFen(url):
    # Synopsis: the text that follows "剧情介绍:" up to the next tag
    contentlink = re.compile(r'剧情介绍:(.*?)<', re.S)
    # The rating pattern is an assumption: it expects the score to sit inside a tag
    # such as <span class="score">7.5</span> on the detail page
    fenlink = re.compile(r'<span class="score">(.*?)</span>', re.S)
    html = askURL(str(url))
    f = ""
    c = ""
    bs = BeautifulSoup(html, "html.parser")
    for item in bs.find_all('div', class_="info"):
        item = str(item)
        content = re.findall(contentlink, item)
        fen = re.findall(fenlink, item)
        if len(fen) > 0:
            f = fen[0]
        if len(content) > 0:
            c = content[0]
    return f, c
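As posted, the functions are never actually called, so the script does nothing when run. A minimal driver is sketched below; calling getData() and writing the five parallel lists out to movies.csv is my own addition, not part of the original code.

import csv  # standard library; used only by this sketch

if __name__ == "__main__":
    titles, imgs, srcs, fens, contents = getData()
    # The five lists are parallel: index i in each list describes the same movie
    with open("movies.csv", "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.writer(f)
        writer.writerow(["title", "poster", "detail_url", "score", "synopsis"])
        writer.writerows(zip(titles, imgs, srcs, fens, contents))
    print("saved", len(titles), "movies to movies.csv")

The utf-8-sig encoding writes a BOM so that Excel displays the Chinese titles correctly when the CSV is opened directly.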