Python crawler source code for a movie resource site

I like watching movies, and searching a resource site by hand every time felt inconvenient, so in some spare time I wrote a small crawler to scrape a movie site. Enough talk; here's the source.

To avoid advertising for it, the resource site's address is hidden.

Step 1: grab every movie link and save it to the movieall table.

import urllib.request
import sqlite3
import re

# Category slug and number of listing pages for each genre.
types = [['Dongzuodianying', 99], ['Kehuandianying', 32], ['Kongbudianying', 84],
         ['Xijudianying', 121], ['Aiqingdianying', 54], ['Juqingdianying', 275],
         ['Zhanzhengdianying', 14]]
baseurl = "https://www.*****.com/"


def write_to_db(name, mtype, url, img):
    conn = sqlite3.connect("D:\\movie\\movie.db")
    sql = '''insert into movieall(name,type,url,imgurl)
             values(:ssname,:sstype,:ssurl,:ssimg)'''
    conn.execute(sql, {'ssname': name, 'sstype': mtype, 'ssurl': url, 'ssimg': img})
    conn.commit()
    conn.close()


for stype in types:
    for i in range(1, stype[1]):
        url = baseurl + stype[0] + "/chart/" + str(i) + ".html"
        print(url)
        html = urllib.request.urlopen(url).read().decode("gbk")
        # Each movie sits in its own <li> block on the listing page. The
        # patterns below are reconstructions, since the (hidden) site's
        # exact markup isn't shown.
        re_li = r'<li>(.*?)</li>'
        li_list = re.findall(re_li, html, re.S | re.M)
        for li in li_list:
            # Capture (detail-page url, title, poster image url).
            info = r'<a href="(.*?)" title="(.*?)".*?<img.*?src="(.*?)"'
            infolist = re.findall(info, li, re.S | re.M)
            if len(infolist) > 0:
                write_to_db(infolist[0][1], stype[0], infolist[0][0], infolist[0][2])
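The script assumes the SQLite table already exists. A minimal one-time setup sketch for movieall, with column names inferred from the INSERT statement above (the post doesn't show the actual schema):

import sqlite3

# Hypothetical schema; column names taken from the INSERT above.
conn = sqlite3.connect("D:\\movie\\movie.db")
conn.execute('''CREATE TABLE IF NOT EXISTS movieall (
                    name   TEXT,
                    type   TEXT,
                    url    TEXT,
                    imgurl TEXT
                )''')
conn.commit()
conn.close()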

Step 2: iterate over movieall, parse each movie's detail page, and save the result to the movieinfo table: movie name, genre, poster URL, download URL, download type, synopsis, and so on.

import urllib.request
import urllib.error
import sqlite3
import time
import re

baseurl = "https://www.******.com/"


def write_to_db(name, mtype, url, img, downurl, downtitle, downtype, desc):
    conn = sqlite3.connect("D:\\movie\\movieall.db")
    sql = '''insert into movieinfo(name,type,url,imgurl,downurl,downtitle,downtype,moviedesc)
             values(:ssname,:sstype,:ssurl,:ssimg,:ssdownurl,:ssdowntitle,:ssdowntype,:ssdesc)'''
    conn.execute(sql, {'ssname': name, 'sstype': mtype, 'ssurl': url, 'ssimg': img,
                       'ssdownurl': downurl, 'ssdowntitle': downtitle,
                       'ssdowntype': downtype, 'ssdesc': desc})
    conn.commit()
    conn.close()


conn = sqlite3.connect("D:\\movie\\movie.db")
cur = conn.cursor()
sqlstr = 'SELECT name,type,url,imgurl,rowid FROM movieall'
for row in cur.execute(sqlstr):
    print(row[0], row[1], row[4])
    url = row[2]
    try:
        html = urllib.request.urlopen(url).read().decode("gbk")
        # Capture (download url, download type, link title). As in step 1,
        # the patterns are reconstructions of the site's markup.
        re_div = r'<a href="(.*?)" class="(.*?)" title="(.*?)"'
        div_list = re.findall(re_div, html, re.S | re.M)
        # Synopsis block; likewise a reconstructed pattern.
        re_desc = r'<div class="intro">(.*?)</div>'
        desc_list = re.findall(re_desc, html, re.S | re.M)
        desc = desc_list[0] if desc_list else ''
        for div in div_list:
            write_to_db(row[0], row[1], row[2], row[3],
                        div[0], div[2], div[1], desc)
    except urllib.error.HTTPError as e:
        print(e.code)
    except UnicodeDecodeError as e:
        print(e)
    time.sleep(0.1)
cur.close()
conn.close()

That's it; set it running. The efficiency isn't great, but it got the job done: roughly 30,000 movies and 80,000 records scraped. I don't know whether I'll manage to watch them all in this lifetime...
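A quick way to check those totals against the two databases (assuming the paths used above):

import sqlite3

# Count rows in each table; paths match the scripts above.
conn = sqlite3.connect("D:\\movie\\movie.db")
print("movies:", conn.execute("SELECT COUNT(*) FROM movieall").fetchone()[0])
conn.close()

conn = sqlite3.connect("D:\\movie\\movieall.db")
print("records:", conn.execute("SELECT COUNT(*) FROM movieinfo").fetchone()[0])
conn.close()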

If you also like movies, help yourself:

https://pan.baidu.com/s/1v5sTa6ylChUGkc00oYWu_A (extraction code: unxk)
