python 写的一款百度关键字url多线程采集

# coding: UTF-8
import urllib
import re
import sys
import os
import threading
from time import ctime
#sys.stderr = None

#获取每页URL,返回当前页URL 列表
def getUrl(url):
    page = urllib.urlopen(url)
    srcHtml = page.read()
    
    reg = '<span class="g">([a-zA-Z0-9<b></b>\._-]+\.[a-zA-Z]{2,6})/.*?</span>'
    aListUrl = re.findall(reg,srcHtml)
    
    for index in range(len(aListUrl)): 
           reg = '[<b>|</b>]'
           ok = re.sub(reg,'',aListUrl[index])
           if ok:
               aListUrl[index] = ok
    return aListUrl

#拼装所有请求页URL地址 "Codier's Blog" "30"
def getPage(search,num,threadNum):
    pageList = []
    for index in range(0,num):
        strBuff = "http://www.baidu.com/s?wd="+search+"&pn="
        nowPage = index*10
        strBuff = strBuff + str(nowPage)
        pageList.append(strBuff)
    return pageList,threadNum

#封装多线程对象列表
def creMultiThread(allPageList,threadNum=2):
    if len(allPageList) < 100:#如果页数少于10页,则不使用多线程
        getPageListUrl(allPageList)
    else:
        threads = []
        nextpage = ceil(len(allPageList)/(10*threadNum))
        for index in range(0,threadNum):
            threads.append(threading.Thread(target=getPageListUrl,args=(pageList[nextpage*index*10:nextpage*(index+1)*10],)))
            for thd in threads:
                thd.setDaemon(True)
                thd.start()
                    
            for thd in threads:
                thd.join()
resultList = []
def getPageListUrl(pageList):
    global resultList
    for index in pageList:
        for index2 in getUrl(index):
            print index2
            resultList.append(index2)

#把每页采集到的URL写入文件
def putFile(allUrl):
    if os.path.exists("url.txt"):
        print u"己删除历史记录,将重新生成采集url...."
        os.remove("url.txt")
    for index in allUrl:
        file = open("url.txt","a+")
        file.write(index+"\n")
        file.close()
#多线程跑起来
if __name__ == '__main__':
    if len(sys.argv) == 4:
        list = getPage(str(sys.argv[1]),int(sys.argv[2]),int(sys.argv[3]))
        st = ctime()
        creMultiThread(list[0],list[1])
        result = set(resultList)
        print u'''
        -----------------------------------------------------
                    多线程百度URL采集器 python:2.7.10
        -----------------------------------------------------
                                                                                    
        ^v^ example:geturl.pyc "关键字"  "页数" "线程数" ^v^  
        ^v^    team:www.secbox.cn                        ^v^ 
            codier_qq:510623849
        -----------------------------------------------------
        '''
        putFile(result)
        print u"开始时间:"+st
        print u"总共采集"+str(len(resultList))+u"条url........."
        print u"去除重复剩余"+str(len(result))+u"条url........."
        print u"采集结果已经自动保存当前目录 url.txt 文件中..... ^V^"
        print u"结束时间:"+ctime()      
    elif len(sys.argv) < 4:
        print u'''
            -----------------------------------------------------
                          多线程百度URL采集器 python:2.7.10
            -----------------------------------------------------
                                                                           
               ^v^ example:geturl.pyc "关键字"  "页数" "线程数" ^v^  
               ^v^    team:www.secbox.cn                        ^v^ 
                 codier_qq:510623849
            -----------------------------------------------------
            '''


你可能感兴趣的:(python)