Python多线程

Python多线程浏览

  • 1、开发环境
    • Python版本
    • 需要手动安装的库
  • 2、源代码
  • 3、使用方法
    • 重要!!!使用前必看
  • 后续会上传可执行文件(.exe)以便没有安装开发环境的小伙伴使用。

1、开发环境

Python版本

使用Python3.8;

需要手动安装的库

requests;
pyquery;
安装方法不做赘述。在CMD中分别敲下面代码。

pip install requests
pip install pyquery

2、源代码

import requests
from requests import RequestException
from pyquery import PyQuery as pq 
import time
import threading
import queue 
import random
# Shared state used by the worker threads.
queueLock = threading.Lock()         # held while process_data inspects the queue
workQueue = queue.Queue(maxsize=10)  # bounded queue handed to every worker
exitFlag = 0                         # non-zero makes process_data stop polling
threads = []                         # every thread created by start()
urlList = [
    'https://blog.csdn.net/qq_41588845/article/details/106394775',
]
# Fill in your own cookie strings here.
cookie = [
    '',
    '',
]
# Pool of browser User-Agent strings.
# NOTE(review): the final entry looks truncated and one UBrowser entry appears
# twice; both are kept unchanged here.
Agent = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
    'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
    'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
    'Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5',
    'Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
    'Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1',
    'Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) Apple',
]
 



# Collect the attributes of every article listed on the blogger's home page.
def get_url(url):
    """Fetch a blog index page, print each article's stats and return its links.

    The page is requested with the first entry of ``Agent`` and ``cookie``.
    For every element with class ``article-item-box`` the title and the view
    count are printed, and the ``href`` of the element's link is collected.

    Args:
        url: Address of the blog home page to scrape.

    Returns:
        A list with the article URLs found on the page. The list is empty
        when the request fails or when no article element is present.
    """
    # Browser-like headers; normally there is no need to change these.
    headers = {
        'User-Agent': Agent[0],
        'cookie': cookie[0],
    }

    try:
        # A timeout keeps a stalled connection from blocking the caller forever.
        html = requests.get(url, headers=headers, verify=True, timeout=10).text
    except RequestException:
        print('请求出错')
        return []

    doc = pq(html)
    urlBuf = []

    for item in doc('.article-item-box').items():
        # The markup is cut on '>' instead of being matched with a regular
        # expression. NOTE(review): this presumably relies on the title text
        # trailing the .article-type element in the serialized HTML - verify
        # against the current page markup.
        title = str(item('.article-type')).split('>')[-1].replace(' ', '')
        print('标题  : ' + title)

        # View count: text of the third '>'-separated piece, up to the next '<'.
        # When the element is missing there are too few pieces, so fall back
        # to an empty string instead of raising IndexError.
        pieces = str(item('.read-num')).split('>')
        readnum = pieces[2].split('<')[0] if len(pieces) > 2 else ''
        print('浏览量: ' + readnum)

        # Skip entries without a link so callers only ever receive strings.
        href = item.find('a').attr('href')
        if href:
            urlBuf.append(href)

    print('\n')
    return urlBuf


# Visit a single article URL.
def get_page(url):
    """Request ``url`` four times, each time with a random browser identity.

    Every request carries a CSDN ``Referer`` plus a User-Agent and a cookie
    picked at random from ``Agent`` and ``cookie``. The responses are
    discarded. If any request fails, the remaining attempts are skipped and
    an error message is printed.

    Args:
        url: Address of the article to visit.

    Returns:
        Always ``None``.
    """
    try:
        for _ in range(4):
            headers = {
                # Pretend the article was reached from the CSDN blog site.
                'Referer': 'https://blog.csdn.net',
                # The header name must be "User-Agent"; the former key
                # "UserAgent" is not the standard user-agent header, so the
                # random browser string was never applied as the user agent.
                'User-Agent': random.choice(Agent),
                'cookie': random.choice(cookie),
            }
            # A timeout keeps a stalled connection from blocking the thread.
            requests.get(url, headers=headers, timeout=10)
    except RequestException:
        print('请求出错')
    return None
class myThread(threading.Thread):
    """Worker thread that keeps visiting every URL in ``urlList``.

    Attributes set by the constructor:
        threadID: Identifier handed in by the creator.
        name: Thread name, printed when the thread starts and stops.
        q: Queue passed on to ``process_data`` once the visit loop ends.
    """

    def __init__(self, threadID, name, q):
        super().__init__()
        self.threadID = threadID
        self.name = name
        self.q = q

    @staticmethod
    def fun(threadID):
        """Visit all known articles in rounds until ``exitFlag`` is set.

        Declared as a static method because it takes no instance; it can be
        called as ``myThread.fun(threadID)`` exactly as before.

        Args:
            threadID: Identifier of the calling thread (currently unused).
        """
        # Checking exitFlag (instead of looping forever) lets run() reach
        # process_data and the exit message once a stop is requested.
        while not exitFlag:
            for article_url in urlList:
                get_page(article_url)
            # Print the current title / view-count information.
            get_url(url)
            # Wait 10-15 seconds before starting the next round.
            time.sleep(random.randint(10, 15))

    def run(self):
        """Thread body: run the visit loop, then drain the work queue."""
        print("开启线程:" + self.name)
        self.fun(self.threadID)
        process_data(self.name, self.q)
        print("退出线程:" + self.name)
def process_data(threadName, q):
    """Poll ``q`` once per second and print each item until ``exitFlag`` is set.

    Args:
        threadName: Name printed in front of every processed item.
        q: Queue to read from.
    """
    while not exitFlag:
        # Test and read the SAME queue while holding the lock. Checking the
        # global workQueue but reading from q could block on an empty q with
        # the lock held if the two ever differ.
        with queueLock:
            has_item = not q.empty()
            if has_item:
                data = q.get()
        # Report outside the lock so other threads are not held up by I/O.
        if has_item:
            print("%s processing %s" % (threadName, data))
        time.sleep(1)

# Launch the worker threads.
def start(num, thread_count=3):
    """Create and start the worker threads, half a second apart.

    Args:
        num: Number of articles. Accepted for compatibility with existing
            callers; it does not influence how many threads are started.
        thread_count: How many ``myThread`` workers to start. Defaults to 3,
            the number that used to be hard-coded.
    """
    for i in range(thread_count):
        thread = myThread(i, i, workQueue)
        thread.start()
        threads.append(thread)
        # Stagger the start-up of the workers.
        time.sleep(0.5)
 # !!! Change this to the URL of your own blog home page !!!
url = 'https://blog.csdn.net/qq_41680042'        
def main():
    """Collect the article links, list them, then start the worker threads."""
    # Scrape the blog home page once and add the links it yields to urlList.
    discovered = get_url(url)
    urlList.extend(discovered)

    # Echo every known URL as a quoted, comma-terminated line.
    for link in urlList:
        print("'" + link + "',")

    # The number of articles equals the number of URLs.
    num = len(urlList)
    print('您一共有 ' + str(num) + ' 篇博客')

    # Start the threads that perform the visits.
    start(num)
        


       
# Run main() only when the file is executed as a script.
if __name__ == '__main__':
    main()
	

3、使用方法

重要!!!使用前必看

需要修改的地方:把你自己博客主页的网址复制到程序 main 函数上方的 url 变量处,替换掉原程序中我的网址即可。

后续会上传可执行文件(.exe)以便没有安装开发环境的小伙伴使用。

你可能感兴趣的:(Python,python,多线程)