9songSpider

# coding:utf-8

# Site category page: https://9song.me/cy/%e5%ae%b6%e5%ba%ad%e4%ba%82%e5%80%ab/
# Listing pages: page/1/, page/2/, ...
# Article links on a listing page: //h2//a/@href
# Article title: //h1
# Article body: //div[@class="entry-inner"]
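#
# A minimal single-page sketch of the XPaths above (not part of the threaded
# spider below; assumes the listing page is reachable without extra request headers):
#
#   import requests
#   from lxml import etree
#   listing = requests.get("https://9song.me/cy/%e5%ae%b6%e5%ba%ad%e4%ba%82%e5%80%ab/page/1/").text
#   for link in etree.HTML(listing).xpath('//h2//a/@href'):
#       article = requests.get(link).text
#       title = etree.HTML(article).xpath('//h1/text()')
#       body = etree.HTML(article).xpath('//div[@class="entry-inner"]//p/text()')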

import urllib2
import Queue
import threading
import requests
from lxml import etree


class Spider(object):

    def __init__(self, url, ag_header):
        self.url = url
        self.head = ag_header

    def load_page(self):
        print("[*]获取页面中每个版块的url中...")
        request = urllib2.Request(self.url, headers=self.head)
        html = urllib2.urlopen(request).read().decode("utf-8")
        path_list = etree.HTML(html).xpath('//h2//a/@href')
        for section_url in path_list:
            # self.load_image(section_url)
            print(section_url)

    def load_image(self, section_url):
        # Despite the name, this fetches an article page and extracts its text
        print("[*] Fetching article data...")
        request = urllib2.Request(section_url, headers=self.head)
        html = urllib2.urlopen(request).read().decode("utf-8")
        tree = etree.HTML(html)
        file_name = tree.xpath('//h1/text()')
        content_txt = tree.xpath('//div[@class="entry-inner"]//p/text()')
        for title in file_name:
            print("[*] Saving article <%s>..." % title.encode("utf-8"))
            for each_txt in content_txt:
                self.save_text(title, each_txt.encode("utf-8"))

    def save_text(self, title, each_txt):
        # print("[*] Saving article...")
        file_name = title + ".txt"
        with open(file_name, "ab") as f:
            f.write(each_txt + "\r\n")
        # print("[*] Finished saving article %s!" % file_name.encode("utf-8"))


class CrawThread(threading.Thread):
    def __init__(self, name, page_queue, each_page_queue, header):
        super(CrawThread, self).__init__(name=name)
        self.page_queue = page_queue
        self.headers = header
        self.each_page_queue = each_page_queue

    def run(self):
        print("Starting %s" % self.name)
        try:
            page = self.page_queue.get(False)
            full_url = "https://9song.me/cy/%e6%a0%a1%e5%9c%92%e5%ad%b8%e7%94%9f/page/" + str(page)
            # requests.get takes headers as a keyword argument, not positionally
            html = requests.get(full_url, headers=self.headers).text
            self.each_page_queue.put(html)
        except Queue.Empty:
            pass
        print("Finished %s" % self.name)


class ParseThread(threading.Thread):
    def __init__(self, thread_name, data_queue, mylock):
        super(ParseThread, self).__init__(name=thread_name)
        self.lock = mylock
        self.data_queue = data_queue

    def run(self):
        print("Starting %s" % self.name)
        # Each item in data_queue is the HTML source of one article page
        html = self.data_queue.get()
        self.handle(html)

    def handle(self, html):
        tree = etree.HTML(html)
        file_name = tree.xpath('//h1/text()')
        content_txt = tree.xpath('//div[@class="entry-inner"]//p/text()')
        for title in file_name:
            print("[*] Saving article <%s>..." % title.encode("utf-8"))
            for each_txt in content_txt:
                self.save_text(title, each_txt.encode("utf-8"))

    def save_text(self, title, each_txt):
        file_name = title + ".txt"
        with open(file_name, "ab") as f:
            f.write(each_txt + "\r\n")
        print("[*]保存文章%s完成!!!" % file_name.encode("utf-8"))


class CrawEachThread(threading.Thread):
    def __init__(self, thread_name, each_page_queue, data_queue, header):
        # Pass the thread name as a keyword; positionally it would be taken as `group`
        super(CrawEachThread, self).__init__(name=thread_name)
        self.each_page_queue = each_page_queue
        self.data_queue = data_queue
        self.header = header

    def run(self):
        print("Starting %s" % self.name)
        try:
            html = self.each_page_queue.get(False)
            self.handle(html)
        except Queue.Empty:
            pass
        print("Finished %s" % self.name)

    def handle(self, html):
        path_list = etree.HTML(html).xpath('//h2//a/@href')
        for section_url in path_list:
            # Put the article page's HTML text (not the Response object) on the queue
            each_html = requests.get(section_url, headers=self.header).text
            self.data_queue.put(each_html)


if __name__ == '__main__':

    """基本信息"""
    url = "https://9song.me/cy/%e6%a0%a1%e5%9c%92%e5%ad%b8%e7%94%9f/page/"
    ag_header = {
        "Host": " 9song.me",
        "Connection": " keep-alive",
        # "Cache-Control" : " max-age=0",
        "Upgrade-Insecure-Requests": " 1",
        # "User-Agent" : " Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
        # "User-Agent" : "Mozilla/5.0(compatible;MSIE 9.0; Windows NT6.1;Trident/5.0)",
        "Accept": " text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": " zh-CN,zh;q=0.9"
    }

    mylock = threading.Lock()  # passed to ParseThread but not currently used

    """所需队列"""
    page_queue = Queue.Queue()
    data_queue = Queue.Queue()
    each_page_queue = Queue.Queue()

    """用户输入请求的指定页面"""
    start_page = int(raw_input("起始页:"))
    end_page = int(raw_input("结束页:"))

    """先put需要爬取的页数"""
    for page in range(start_page, end_page + 1):
        page_queue.put(page)

    """启动采集线程..."""
    craw_list = ["craw-1", "craw-2", "craw-3"]
    craw_thread = []

    for thread_name in craw_list:
        thread = CrawThread(thread_name, page_queue, each_page_queue, ag_header)
        thread.start()
        craw_thread.append(thread)

    """获取每一页的url"""
    craw_each_page = ["each-1", "each-2", "each-3"]
    craw_each_thread = []

    for thread_name in craw_each_page:
        thread = CrawEachThread(thread_name, each_page_queue, data_queue, ag_header)
        thread.start()
        craw_each_thread.append(thread)

    """启动解析线程..."""
    parse_list = ["parse-1", "parse-2", "parse-3"]
    parse_thread = []

    """需要的初始化参数不同"""
    for parse_name in parse_list:
        thread = ParseThread(parse_name, data_queue, mylock)
        thread.start()
        parse_thread.append(thread)

        # print("[*]正在爬取此分类下第%d页..." % page)
        # sexy = Spider(full_url, ag_header)
        # sexy.load_page()
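
    # Not part of the original script: optionally wait for every worker thread
    # before exiting. The parse threads use a blocking data_queue.get(), so they
    # only return once an article page has actually reached the queue.
    for worker in craw_thread + craw_each_thread + parse_thread:
        worker.join()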
