平时比较喜欢看电影,每次去资源站搜索感觉很不方便,闲来无事就写了个小虫把电影站点给爬了。。废话不多说,看源码。
避免广告,资源站地址隐藏。
第一步,抓取所有电影链接,保存到表movieall中
from html.parser import HTMLParser
import urllib.request
import sys
import sqlite3
import time
import re
# Crawl plan: each entry is [category slug, number of paginated chart pages].
# The slug is appended to baseurl to form the listing URL for that genre.
types = [
    ['Dongzuodianying', 99],    # action
    ['Kehuandianying', 32],     # sci-fi
    ['Kongbudianying', 84],     # horror
    ['Xijudianying', 121],      # comedy
    ['Aiqingdianying', 54],     # romance
    ['Juqingdianying', 275],    # drama
    ['Zhanzhengdianying', 14],  # war
]
# Root of the resource site (domain masked by the author to avoid advertising).
baseurl = "https://www.*****.com/"
def write_to_db(name, mtype, url, img, db_path="D:\\movie\\movie.db"):
    """Insert one movie listing row into table ``movieall``.

    Args:
        name: movie title.
        mtype: category slug (e.g. ``'Dongzuodianying'``).
        url: detail-page URL of the movie.
        img: poster image URL.
        db_path: SQLite database file. New optional parameter; defaults to
            the original hard-coded path, so existing callers are unaffected.
    """
    sql = '''insert into movieall(name,type,url,imgurl) values(:ssname,:sstype,:ssurl,:ssimg)'''
    conn = sqlite3.connect(db_path)
    try:
        # Named-parameter binding (no string concatenation into SQL).
        # Original called conn.cursor() twice: the .close() closed a brand-new
        # cursor, not the one that executed, and the connection leaked if
        # execute() raised — fixed by try/finally on the connection itself.
        conn.execute(sql, {'ssname': name, 'sstype': mtype,
                           'ssurl': url, 'ssimg': img})
        conn.commit()
    finally:
        conn.close()
# Walk every chart page of every category, scrape the movie entries and
# persist them via write_to_db().
#
# NOTE(review): this script lost all indentation and its regex patterns when
# it was pasted/extracted (the original HTML inside the patterns was stripped,
# and one line was truncated mid-string). Structure is reconstructed below;
# the regex fragments MUST be restored from the site's real page markup
# before running.
for stype in types:
    for page in range(1, stype[1]):
        url = baseurl + stype[0] + "/chart/" + str(page) + ".html"
        print(url)
        # Site serves GBK-encoded pages.
        html = urllib.request.urlopen(url).read().decode("gbk")
        # NOTE(review): only this fragment of the list-item pattern survived
        # extraction — TODO restore the full pattern.
        re_li = r'(.*?) '
        li_list = re.findall(re_li, html, re.S | re.M)
        for li in li_list:
            # NOTE(review): the original line was truncated to "info= r'0:".
            # Presumably it was: a per-item pattern, a findall over `li`, and
            # a len(...) > 0 guard — reconstructed in that shape below.
            info = r''  # TODO: restore the original per-item pattern
            infolist = re.findall(info, li, re.S | re.M)
            if len(infolist) > 0:
                # Group order in the mangled source: (url, name, imgurl).
                write_to_db(infolist[0][1], stype[0], infolist[0][0], infolist[0][2])
第二步,遍历表movieall,解析电影信息并保存到表movieinfo中,包括电影名称,类型,海报地址,下载地址,下载类型,电影简介等信息。
from html.parser import HTMLParser
import urllib.request
import urllib.error
import sys
import sqlite3
import time
import re
# Same crawl plan as step 1: [category slug, chart page count] per genre.
types = [
    ['Dongzuodianying', 99],    # action
    ['Kehuandianying', 32],     # sci-fi
    ['Kongbudianying', 84],     # horror
    ['Xijudianying', 121],      # comedy
    ['Aiqingdianying', 54],     # romance
    ['Juqingdianying', 275],    # drama
    ['Zhanzhengdianying', 14],  # war
]
# Root of the resource site (domain masked by the author to avoid advertising).
baseurl = "https://www.******.com/"
def write_to_db(name, mtype, url, img, downurl, downtitle, downtype, desc,
                db_path="D:\\movie\\movieall.db"):
    """Insert one fully-parsed movie record into table ``movieinfo``.

    Args:
        name: movie title.
        mtype: category slug.
        url: detail-page URL.
        img: poster image URL.
        downurl: download link.
        downtitle: download link title.
        downtype: download link type (e.g. magnet / ed2k — inferred, verify).
        desc: movie synopsis text.
        db_path: SQLite database file. New optional parameter; defaults to
            the original hard-coded path, so existing callers are unaffected.
    """
    sql = '''insert into movieinfo(name,type,url,imgurl,downurl,downtitle,downtype,moviedesc) values(:ssname,:sstype,:ssurl,:ssimg,:ssdownurl,:ssdowntitle,:ssdowntype,:ssdesc)'''
    conn = sqlite3.connect(db_path)
    try:
        # Original closed a freshly-created second cursor (a no-op for the
        # working one) and leaked the connection on error; try/finally fixes it.
        conn.execute(sql, {'ssname': name, 'sstype': mtype, 'ssurl': url,
                           'ssimg': img, 'ssdownurl': downurl,
                           'ssdowntitle': downtitle, 'ssdowntype': downtype,
                           'ssdesc': desc})
        conn.commit()
    finally:
        conn.close()
# Walk every row previously stored in movieall, fetch its detail page, parse
# the download links and synopsis, and persist them via write_to_db().
#
# NOTE(review): this script also lost its indentation and regex contents in
# extraction; structure is reconstructed below. The patterns (re_div, re_desc)
# are empty/fragmentary and MUST be restored from the site's real markup —
# as written, findall(r'', ...) yields empty-string matches and div[0] would
# raise on them.
conn = sqlite3.connect("D:\\movie\\movie.db")
cur = conn.cursor()
sqlstr = 'SELECT name,type,url,imgurl,rowid FROM movieall'
for row in cur.execute(sqlstr):
    print(row[0], row[1], row[4])
    url = row[2]
    try:
        # Site serves GBK-encoded pages.
        html = urllib.request.urlopen(url).read().decode("gbk")
        # NOTE(review): download-link pattern was stripped — TODO restore.
        re_div = r''
        div_list = re.findall(re_div, html, re.S | re.M)
        # NOTE(review): synopsis pattern reduced to this fragment — TODO restore.
        re_desc = r'(.*?)'
        desc_list = re.findall(re_desc, html, re.S | re.M)
        for div in div_list:
            # Group order per the mangled source: (downurl, downtype, downtitle).
            write_to_db(row[0], row[1], row[2], row[3],
                        div[0], div[2], div[1], desc_list[0])
    except urllib.error.HTTPError as e:
        # Dead detail page: log the status code and move on.
        print(e.code)
    except UnicodeDecodeError as e:
        # Occasional non-GBK page: skip rather than abort the whole run.
        print(e)
    # Be polite to the server between requests.
    time.sleep(0.1)
cur.close()
conn.close()
好了,开跑吧,效率虽然不怎么样,不过还是达到目的了,大概抓了3万部电影,8万条记录。不知道有生之年能不能看完。。。
如果也有喜欢电影的朋友请自取。(注:原文此处附有数据库下载链接,转载/提取时已丢失。)