# Self-written crawler for dytt8.net ("Movie Heaven") movie listings.

import requests,chardet
from lxml import etree
import re
import pymysql

class MysqlHelper(object):
    """Thin wrapper around a pymysql connection for running write statements."""

    def __init__(self):
        # NOTE(review): credentials are hard-coded; move them to config or
        # environment variables before deploying.
        self.db = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                                  port=3306, database='py101', charset='utf8')
        self.cursor = self.db.cursor()

    # Execute a write statement (INSERT/UPDATE/DELETE) and commit.
    def mysql_do(self, sql, params=None):
        """Execute *sql* and commit.

        :param sql: SQL string; may contain %s placeholders.
        :param params: optional sequence of values for a parameterized query
                       (preferred over string formatting — avoids SQL injection).
                       Defaults to None so existing callers are unaffected.
        :raises: re-raises any pymysql error after rolling back.
        """
        try:
            self.cursor.execute(sql, params)
            self.db.commit()
        except Exception:
            # Roll back the failed transaction so the connection stays usable.
            self.db.rollback()
            raise

    # Destructor: release cursor and connection.
    def __del__(self):
        # Guard: if __init__ raised before setting these attributes,
        # accessing them here would raise AttributeError during teardown.
        try:
            self.cursor.close()
            self.db.close()
        except Exception:
            pass

def b(particulars_url, mysql_):
    """Fetch one movie detail page, extract the title and download link,
    and insert both into the `py333` table.

    :param particulars_url: absolute URL of the movie detail page
    :param mysql_: MysqlHelper instance used to run the INSERT
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
    }
    response1 = requests.get(particulars_url, headers=headers)
    # The site serves GB2312-encoded pages; requests mis-guesses the
    # encoding without this, producing mojibake.
    response1.encoding = 'gb2312'
    html = response1.text
    html_ele = etree.HTML(html)

    # Movie title sits in the page header block.
    name = html_ele.xpath('//div[@class="title_all"]/h1/font/text()')[0]
    print(name)

    # BUG FIX: the original pattern was empty (re.search(r'', html)), so
    # .group(1) always raised IndexError. Reconstructed from the
    # commented-out hint that was left in the source:
    #   bgcolor="#fdfddf"><a href="(.*?)">
    match = re.search(r'bgcolor="#fdfddf">\s*<a href="(.*?)"', html)
    if match is None:
        # Page layout changed or no download link present: skip the
        # insert instead of crashing the whole crawl.
        print('no download link found for', particulars_url)
        return
    html_href = match.group(1)
    print(html_href)

    # NOTE(review): string-built SQL is injection-prone; repr() only
    # approximates proper quoting. Prefer a parameterized query
    # ('... values(%s,%s)' with a params tuple) if the helper supports it.
    sql = 'insert into py333(name,html_href)values({},{})'.format(repr(name), repr(html_href))
    print(sql)
    mysql_.mysql_do(sql)



def a():
    """Crawl list pages 1-3 of the dytt8 'latest movies' index and hand
    each detail-page URL to b() for extraction and DB insertion."""
    # FIX: create one helper (one DB connection) for the whole crawl
    # instead of opening a fresh connection per list page.
    mysql_ = MysqlHelper()
    for i in range(1, 4):
        list_url = 'http://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'.format(i)
        headers = {
            'Cookie': 'cscpvcouplet4298_fidx=1; XLA_CI=4bcc75a53587f6ef64ae25e76968175d; cscpvrich5041_fidx=1',
            'Host': 'www.dytt8.net',
            'If-Modified-Since': 'Sun, 19 Aug 2018 03:01:11 GMT',
            'If-None-Match': '"80f518e26837d41:328"',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
        }
        response = requests.get(list_url, headers=headers)
        html_ele = etree.HTML(response.text)
        # Each movie entry is one <table> inside the list container.
        li_list = html_ele.xpath('//div[@class="co_content8"]/ul/td[1]/table')
        for li_ele in li_list:
            xq = li_ele.xpath('./tr[2]/td[2]/b/a/@href')
            # Some entries carry two anchors; the second one is the real
            # detail-page link, so drop the first.
            if len(xq) == 2:
                xq.pop(0)
            if not xq:
                # FIX: guard against malformed rows — the original raised
                # IndexError on xq[0] when the xpath matched nothing.
                continue
            particulars_url = 'http://www.dytt8.net' + xq[0]
            print(particulars_url)
            b(particulars_url, mysql_)
        # print('--'*25)
# http://www.dytt8.net/html/gndy/dyzz/20180817/57299.html


# Script entry point: run the crawler only when executed directly.
if __name__ == '__main__':
    a()







# (blog-scrape artifact, kept as a comment) "You may also be interested in: self-written crawler -- Movie Heaven"