抓知乎某专题下所有提问以及链接

#-*-coding:utf8-*-
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import requests
import sys

reload(sys)
sys.setdefaultencoding('utf-8')

def getcontent(url):
    """Fetch one Zhihu collection page and append its question links to ``f``.

    For each question title on the page, writes one tab-separated line
    '第N页\t<absolute link>\t<title>' to the module-level file handle ``f``,
    where N is taken from the '?page=N' suffix of *url*.

    Relies on module-level ``headers`` (request headers) and ``f`` (open file),
    both defined in the ``__main__`` block.

    NOTE(review): ``f`` is shared by all ThreadPool workers without a lock, so
    lines from different pages may interleave; consider returning the lines
    and writing them in the main thread instead.
    """
    html = requests.get(url, headers=headers).text
    selector = etree.HTML(html)
    content = selector.xpath('//div[@id="zh-list-answer-wrap"]/div[@class="zm-item"]/h2[@class="zm-item-title"]')
    # Page number follows the single '=' in '?page=N'; hoisted out of the
    # loop since it is the same for every item on the page.
    page_no = url.split('=')[1]
    for each in content:
        hrefs = each.xpath('a/@href')
        texts = each.xpath('a/text()')
        if not hrefs or not texts:
            # Malformed/ad item without a title link — skip it rather than
            # letting an IndexError kill this worker thread mid-crawl.
            continue
        re_href = 'https://www.zhihu.com' + hrefs[0]
        f.write('第' + page_no + '页\t' + re_href + '\t' + texts[0] + '\n')

if __name__ == '__main__':
    # Browser-like headers so Zhihu serves the normal HTML page.
    headers = { "Accept":"text/html,application/xhtml+xml,application/xml;",
            "Accept-Encoding":"gzip",
            "Accept-Language":"zh-CN,zh;q=0.8",
            "Referer":"http://www.example.com/",
            "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36"
            }
    pool = ThreadPool(4)
    # Context manager guarantees the output file is flushed and closed even
    # if a worker raises inside pool.map; `f` stays a module-level name so
    # getcontent() can write to it.
    with open('zhihu.txt', 'a') as f:
        # Pages 1..35 of the collection.
        page = ['http://www.zhihu.com/collection/27109279?page=' + str(i)
                for i in range(1, 36)]
        results = pool.map(getcontent, page)
        pool.close()
        pool.join()


你可能感兴趣的:(抓知乎某专题下所有提问以及链接)