Example: scraping Maoyan movies with a multiprocessing pool

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Author   : zsc
# @FILE     : 多线程抓取猫眼电影.py
# @Time     : 2019/4/8 10:27
# @Software : PyCharm

import re
import os
import time
import json
import requests
from multiprocessing import Pool
from requests.exceptions import RequestException


def get_one_page(url):
    '''
    获取网页html内容并返回
    '''
    try:
        # 获取网页html内容
        response = requests.get(url)
        # 通过状态码判断是否获取成功
        if response.status_code == 200:
            return response.text
        return None
    except RequestException as e:
        print("打印错误信息:", e)
        return None


def parse_one_page(html):
    '''
    解析HTML代码,提取有用信息并返回
    '''
    # 正则表达式进行解析
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)"'
        r'.*?name"><a.*?>(.*?)</a>.*?"star">(.*?)</p>'
        r'.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>'
        r'.*?fraction">(.*?)</i>.*?</dd>', re.S)
    # find every movie entry that matches the pattern
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            'actor': item[3].strip()[3:],   # strip the "主演:" prefix
            'time': item[4].strip()[5:],    # strip the "上映时间:" prefix
            'score': item[5] + item[6]      # integer part + fractional part
        }


def write_to_file(content):
    '''
    Append one record to the result file as a JSON line.
    '''
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def save_image_file(url, path):
    '''
    Save a movie cover image to disk.
    '''
    ir = requests.get(url)
    if ir.status_code == 200:
        with open(path, 'wb') as f:
            f.write(ir.content)


def main(offset):
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:  # request failed; skip this page
        return
    # create the covers directory if needed (exist_ok avoids a race between workers)
    os.makedirs('covers', exist_ok=True)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)
        save_image_file(item['image'],
                        'covers/' + '%03d' % int(item['index']) + item['title'] + '.jpg')


if __name__ == '__main__':
    start = time.time()
    print("Start time:", start)
    # use a process pool to speed things up
    pool = Pool()
    # Serial version for comparison:
    # for i in range(0, 100, 10):
    #     main(i)
    # took about 9.41 s
    pool.map(main, [i * 10 for i in range(10)])
    end = time.time()
    print("End time:", end)
    print("Total time:", end - start)  # about 2.17 s with the pool
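One caveat about this design: every pool worker appends to the same result.txt. Short single-line writes in append mode usually land intact, but to make the serialization explicit you can share a Lock through the pool initializer (a Lock cannot be passed as a normal map() argument). A minimal sketch, with names that are assumptions rather than part of the original script:

import json
from multiprocessing import Pool, Lock

lock = None  # set in each worker by init_worker

def init_worker(l):
    global lock
    lock = l

def write_record(record):
    with lock:  # only one process appends at a time
        with open('result.txt', 'a', encoding='utf-8') as f:
            f.write(json.dumps(record, ensure_ascii=False) + '\n')

if __name__ == '__main__':
    l = Lock()
    with Pool(initializer=init_worker, initargs=(l,)) as pool:
        pool.map(write_record, [{'index': str(i)} for i in range(10)])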

This scrape is quite efficient: pool.map hands the ten page offsets straight to the worker processes. The data volume here is small, but the difference is still measurable: roughly 9.4 s for the serial loop versus 2.2 s with the pool, about a 4x speedup. In other words, a job that would take four hours serially finishes in about one, as the sketch below illustrates.
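To make the comparison concrete, here is a self-contained timing sketch; fetch_page is a hypothetical stand-in for main() above, with a sleep simulating network latency:

import time
from multiprocessing import Pool

def fetch_page(offset):
    time.sleep(0.5)  # simulate the network latency of one page fetch
    return offset

if __name__ == '__main__':
    t0 = time.time()
    for i in range(0, 100, 10):  # serial: ten fetches back to back, ~5 s
        fetch_page(i)
    print('serial:', time.time() - t0)

    t0 = time.time()
    with Pool() as pool:  # parallel: roughly 0.5 s * ceil(10 / worker count)
        pool.map(fetch_page, range(0, 100, 10))
    print('pool.map:', time.time() - t0)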

Usage of from multiprocessing.dummy import Pool as ThreadPool

import os
import re
import requests
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool


def spider(url):
    print(url)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
    html = requests.get(url, headers=headers)  # pretend to be a browser
    selector = etree.HTML(html.text)  # parse the HTML into a tree for XPath
    content = selector.xpath('//figure[@class="post-image "]')  # grab the figure tags
    for each in content:
        tmp = each.xpath('a/img/@src')  # extract the src attribute of the img tag
        pic = requests.get(tmp[0])  # fetch the image
        print('downloading: ' + tmp[0])
        # extract the image name from the URL with a regular expression
        string = re.search(r'\d+/\d+/(.*?)\.jpg', str(tmp[0])).group(1)
        with open('pic2/' + string + '.jpg', "wb") as f:
            f.write(pic.content)


if __name__ == '__main__':
    os.makedirs('pic2', exist_ok=True)  # make sure the output directory exists
    pool = ThreadPool(2)  # a pool of 2 worker threads
    pool.map(spider, ['http://hotpics.cc/page/' + str(i) for i in range(1, 11)])  # threaded scraping
    pool.close()
    pool.join()
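multiprocessing.dummy exposes the same Pool API but backs it with threads instead of processes, which is usually the better fit for I/O-bound work like these downloads: there is no process startup or pickling cost, and unpicklable callables such as lambdas work. A minimal sketch with placeholder URLs:

from multiprocessing.dummy import Pool as ThreadPool
import requests

urls = ['https://httpbin.org/get'] * 4  # placeholder URLs for illustration

with ThreadPool(4) as pool:
    # lambdas are fine here: threads share memory, so nothing is pickled
    statuses = pool.map(lambda u: requests.get(u, timeout=10).status_code, urls)
print(statuses)  # e.g. [200, 200, 200, 200]

The final example returns to process pools, this time against an ASP.NET WebForms site whose paging is driven by __VIEWSTATE/__EVENTVALIDATION postbacks.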
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Author   : zsc
# @FILE     : 昆山.py
# @Time     : 2019/5/23 11:24
# @Software : PyCharm
import os
import re
import time
import requests
import pandas as pd
from lxml import etree
from multiprocessing import Pool


def index(df=None):
    # avoid a mutable default argument; start with an empty frame
    if df is None:
        df = pd.DataFrame()
    index_url = "http://www.kshome.com.cn/Ksht/RoomInfo.aspx"
    headers = {
        'Cookie':'ASP.NET_SessionId=3vipldp1m2awav0zm14dplv2',
        'Host':'www.kshome.com.cn',
        'Origin':'http://www.kshome.com.cn',
        'Referer':'http://www.kshome.com.cn/Ksht/RoomInfo.aspx?id=20819',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
    }
    params = {"id": 20819}
    for i in range(1, 14):
        print("fetching page:", i)
        index_res = requests.get(url=index_url, headers=headers, params=params)
        # pull the WebForms hidden fields needed for the paging postback
        VIEWSTATE = re.findall(r'name="__VIEWSTATE" id="__VIEWSTATE" value="(.*?)" />', index_res.text, re.DOTALL)[0]
        EVENTVALIDATION = re.findall(r'name="__EVENTVALIDATION" id="__EVENTVALIDATION" value="(.*?)" />', index_res.text, re.DOTALL)[0]
        data = {
            '__EVENTTARGET': 'GridView1$_ctl43$btnGo',
            '__EVENTARGUMENT':  '',
            '__VIEWSTATE': VIEWSTATE,
            '__EVENTVALIDATION': EVENTVALIDATION,
            'txtCs': '',
            'txtSh': '',
            'GridView1:_ctl43:txtNewPageIndex': i,
        }
        response = requests.post(url=index_url, headers=headers, params=params, data=data)
        html = etree.HTML(response.text)
        # columns 1-5 of the GridView table
        a = html.xpath("//table[@id='GridView1']//tr/td[1]/text()")
        b = html.xpath("//table[@id='GridView1']//tr/td[2]/text()")
        c = html.xpath("//table[@id='GridView1']//tr/td[3]/text()")
        d = html.xpath("//table[@id='GridView1']//tr/td[4]/text()")
        e = html.xpath("//table[@id='GridView1']//tr/td[5]/text()")
        a = a[:len(b)]  # drop pager-row text that leaks into the first column
        df1 = pd.DataFrame([a, b, c, d, e]).T
        # DataFrame.append was removed in pandas 2.0; use concat instead
        df = pd.concat([df, df1], ignore_index=True)
    print("___________________________________________")
    df.drop_duplicates(inplace=True)
    return df


def main(number):
    print("worker started:", number)
    # note: every pool worker reads and rewrites the same Excel file,
    # so concurrent runs can race; see the sketch after this script
    if os.path.exists("./all3.xlsx"):
        df = pd.read_excel("./all3.xlsx")
        df = index(df)
    else:
        df = index()
    print(len(df))
    df.to_excel("all3.xlsx")


if __name__ == '__main__':
    start = time.time()
    print("Start time:", start)
    pool = Pool()
    pool.map(main, list(range(10)))
    pool.close()
    pool.join()
    end = time.time()
    print("End time:", end)
    print("Total time:", end - start)


 
