Python多线程爬虫

实现了一个简单的多线程爬虫, 爬取百度贴吧某个帖子的回帖用户、回帖内容和回帖时间。

1. 使用multiprocessing.dummy线程池的pool.map实现简单的多线程抓取。

2. 使用XPath代替正则表达式来提取页面内容。

# -*-coding:utf-8-*-

import re
import time
import requests
import json
import sys

# Python 2 only: switch the interpreter's default string encoding to UTF-8 so
# that implicit str/unicode conversions of the Chinese text below use UTF-8.
# reload(sys) is called first so that sys.setdefaultencoding is available --
# presumably because site.py removes it at startup; TODO confirm.
reload(sys)
sys.setdefaultencoding("utf-8")
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool


def towrite(contentdict):
    """Append one reply record to the module-level output file ``f``.

    ``contentdict`` must provide the keys ``topic_reply_time``,
    ``topic_reply_content`` and ``user_name``.  The record is written as
    three labelled lines followed by one blank line.
    """
    record = (
        u'回帖时间:' + str(contentdict['topic_reply_time']) + '\n'
        + u'回帖内容:' + unicode(contentdict['topic_reply_content']) + '\n'
        + u'回帖人: ' + contentdict['user_name'] + '\n\n'
    )
    # One write call per record: ``f`` is shared by all worker threads, so
    # writing the three lines separately would let another thread's output
    # land in the middle of this record.
    f.write(record)


def spider(url):
    """Fetch one thread page and pass every reply on it to ``towrite``.

    For each reply container found on the page, the author name and reply
    date are read from the JSON stored in its ``data-field`` attribute, and
    the reply body is taken from the first text node of the content div.
    Containers without a ``data-field`` attribute are skipped; replies with
    no direct text node are recorded with empty content.
    """
    html = requests.get(url)
    selector = etree.HTML(html.text)
    content_field = selector.xpath('//div[@class="l_post j_l_post l_post_bright  "]')
    for each in content_field:
        data_field = each.xpath('@data-field')
        if not data_field:
            # Without the metadata attribute there is no author or date.
            continue
        # Parse the attribute value unmodified: JSON needs its double quotes
        # intact, so they must not be stripped before decoding.
        reply_info = json.loads(data_field[0])
        texts = each.xpath('div[@class="d_post_content_main"]/div[1]/cc/div[@class="d_post_content j_d_post_content  clearfix"]/text()')
        # A fresh dict per reply, so no state leaks between iterations.
        item = {
            'user_name': reply_info['author']['user_name'],
            # The xpath yields no text node for some replies; indexing [0]
            # there would raise and abort the whole pool.map call.
            'topic_reply_content': texts[0] if texts else u'',
            'topic_reply_time': reply_info['content']['date'],
        }
        towrite(item)


if __name__ == '__main__':
    # URLs for pages 1..20 of the thread.
    page = ['http://tieba.baidu.com/p/3522395718?pn=' + str(i)
            for i in range(1, 21)]
    # ``f`` is bound at module level because towrite() appends to it.
    # The with-block guarantees the file is really closed on exit.
    with open('content.txt', 'a') as f:
        pool = ThreadPool(8)
        try:
            pool.map(spider, page)
        finally:
            # Always release the worker threads, even if a page failed.
            pool.close()
            pool.join()


你可能感兴趣的:(Python网络爬虫,Python项目实战开发)