一只小爬虫:抓取 360 影视(360kan.com)的电影和电视剧列表页(新手作品,轻点吐槽)

运行方式(需在下方的类定义之后执行):

sp=Spider()

sp.go()

import urllib.request
import re
import sys
import threading

class Spider:
    """Scraper for the 360kan.com movie and TV-series listing pages.

    ``category`` maps a media-type key (``"dianying"`` for movies,
    ``"dianshi"`` for TV series) to the base URL of its paginated listing.
    """

    # Root that detail-page links found in the listing are resolved against.
    _SITE_ROOT = "http://www.360kan.com/"

    # One <li> element per listed title; the markup differs per media type.
    _ITEM_PATTERNS = {
        'dianying': re.compile(
            r'''<li class="le-figure le-figure-horizontal">.*?</li>''', re.S),
        'dianshi': re.compile(
            r'''<li\s*id="movie-.*?"\s*class="le-figure le-figure-horizontal">.*?</li>''', re.S),
    }
    _LINK_RE = re.compile(r'<div class="pic">\s*<a\s*href="(.*?)".*?</div>', re.S)
    _TITLE_RE = re.compile(r'<p\s*class="video-title">.*?<a.*?>(.*?)</a>.*?</p>', re.S)
    _ACTORS_RE = re.compile(r'<span class="text">主演:</span>(.*?)</p>', re.S)
    _YEAR_RE = re.compile(r'<span class="text">年代:</span>(.*?)</p>')
    _TAGS_RE = re.compile(r'<span class="text">看点:</span>(.*?)</p>', re.S)
    # The two anchor-text patterns differ only in DOTALL; both are kept so the
    # actor and tag extraction match exactly what they matched before.
    _ACTOR_NAME_RE = re.compile(r'<a.*?>(.*?)</a>')
    _TAG_NAME_RE = re.compile(r'<a.*?>(.*?)</a>', re.S)

    def __init__(self):
        self.category = {
            "dianying": "http://www.360kan.com/dianying/list.php",
            "dianshi": "http://www.360kan.com/dianshi/list.php",
        }

    def go(self):
        """Start one ``myThread`` worker per listing page of every category.

        Pages 1 through 20 are requested for each entry in ``category``.
        The threads are started but not joined, and nothing is returned.
        """
        for mtype, baseurl in self.category.items():
            for n in range(1, 21):
                myThread(n, mtype, baseurl).start()

    def getHtml(self, url):
        """Fetch *url* and return its body decoded as UTF-8.

        Returns an empty string when the response status is not 200, so the
        caller can skip the page instead of the process/thread being
        terminated. Errors raised by ``urlopen`` or by decoding propagate.
        """
        # The context manager closes the connection on every path.
        with urllib.request.urlopen(url) as response:
            if response.getcode() != 200:
                return ''
            return response.read().decode('utf-8')

    @staticmethod
    def _first(pattern, text):
        """Return the first capture of *pattern* in *text*, or '' if absent."""
        found = pattern.search(text)
        return found.group(1) if found else ''

    def matchItems(self, html, mtype):
        """Extract the listed titles from one listing page.

        Args:
            html: page source of a listing page.
            mtype: ``"dianying"`` or ``"dianshi"``; selects the item markup.

        Returns:
            A list of ``(url, title, main_actors, year, types)`` tuples.
            ``main_actors`` is a list of names. ``types`` is a list of tag
            names for movies that have a tag section and ``''`` otherwise.
            A field that is missing from an item is returned empty rather
            than aborting the whole page.

        Raises:
            ValueError: if *mtype* is not a supported media type.
        """
        # Local import: the file itself only imports ``urllib.request``.
        from urllib.parse import urljoin

        try:
            item_re = self._ITEM_PATTERNS[mtype]
        except KeyError:
            raise ValueError('unsupported media type: %r' % (mtype,)) from None

        items = []
        for block in item_re.findall(html):
            # Detail-page link; urljoin copes with root-relative and
            # absolute hrefs without producing "//" or a doubled host.
            href = self._first(self._LINK_RE, block)
            url = urljoin(self._SITE_ROOT, href) if href else ''

            title = self._first(self._TITLE_RE, block)

            # Leading actors: the section holds one <a> per actor.
            actors = self._ACTORS_RE.search(block)
            main_actors = (
                self._ACTOR_NAME_RE.findall(actors.group(1)) if actors else []
            )

            year = self._first(self._YEAR_RE, block)

            # Tags are only looked up for movies.
            types = ''
            if mtype == 'dianying':
                tags = self._TAGS_RE.search(block)
                if tags:
                    types = self._TAG_NAME_RE.findall(tags.group(1))

            items.append((url, title, main_actors, year, types))
        return items


class myThread(threading.Thread):
    """Worker thread that scrapes a single listing page.

    Attributes:
        n: page number, appended to the base URL as the ``pageno`` value.
        mtype: media-type key forwarded to ``Spider.matchItems``.
        baseurl: listing URL that the page query string is appended to.
    """

    def __init__(self, n, mtype, baseurl):
        super().__init__()
        self.n = n
        self.mtype = mtype
        self.baseurl = baseurl

    def run(self):
        """Fetch the page, parse it, then print and return the parsed items.

        When the fetch yields no HTML, an empty string is printed and
        returned instead of a list.
        """
        crawler = Spider()
        page_url = self.baseurl + '?pageno=' + str(self.n)
        page = crawler.getHtml(page_url)
        items = crawler.matchItems(page, self.mtype) if page else ''
        print(items)
        return items


你可能感兴趣的:(一只小虫子,360视频(新手轻点吐槽))