Scraping data with multiple threads in Python

Requirement: scrape the postal codes and area codes for every region in China.

Approach:

1. Get the list of all provinces from the first-level page.

2. Spawn threads to scrape and parse each province's data, saving the results to files as they come back.

Code:

import re
import requests
import os
from os import path
from concurrent.futures import ThreadPoolExecutor

threadPool = ThreadPoolExecutor(30)  # create a thread pool holding up to 30 worker threads

def getPageData(pageUrl):
    # Set request headers: there are so many crawlers these days that a request
    # with missing headers gets blocked outright.
    headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
               'Cache-Control': 'max-age=1',
               'Connection': 'keep-alive',
               'Referer': 'https://www.hao123.com/?tn=99480011_hao_pg',
               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'}
    response = requests.get(pageUrl, headers=headers)
    if response.status_code == 200:
        print("------success--------" + pageUrl)
        return response.text
    else:
        print("------failure--------" + pageUrl + "----" + str(response.status_code))
        # On failure, return the numeric status code so the callback can skip this page.
        return response.status_code

def getProviceData(provice):
    fullUrl = 'http://www.ip138.com/post/yzbm/%s' % provice
    proviceName = provice.split(".")[0]  # strip the file extension to get the province name
    proviceContent = getPageData(fullUrl)
    return {'name': proviceName, 'content': proviceContent}

def parseFirstPageData(firstPageData):
    proviceUrlList = list()
    # re.S makes '.' match newlines too, so the whole document is treated as one line
    urls = re.findall(r'shape="poly".*?href="(.*?)"', firstPageData, re.S)
    for url in urls:
        if len(url) != 1:  # skip one-character placeholder links
            if url not in proviceUrlList:  # de-duplicate
                proviceUrlList.append(url)
    return proviceUrlList
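
# Illustrative assumption about the image map the regex above targets: an element like
#   <area shape="poly" coords="..." href="anhui.htm">
# would yield 'anhui.htm', which getProviceData then turns into the name 'anhui'.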

def parseSecondPageData(obj):
    res = obj.result()
    proviceName = res['name']
    proviceContent = res['content']
    print("------fileName--------" + proviceName)
    if isinstance(proviceContent, int):  # request failed (e.g. 503), nothing to parse
        return
    os.makedirs("area", exist_ok=True)  # make sure the output directory exists
    fileName = path.join("area", proviceName)
    file = open(fileName, 'w', encoding='utf8')
    # The <td>/<a> patterns below are reconstructed by hand (the original post's
    # regexes lost their HTML tags); adjust them to the page's actual markup.
    tdList = re.findall(r'<td.*?>(.*?)</td>', proviceContent, re.I | re.M | re.S)
    print("------tdList--------" + str(len(tdList)))
    cityInfoList = list()
    for tdItem in tdList:
        if "div" in tdItem:  # only cells wrapping a <div> carry city rows
            cityItemName = re.search(r'<a.*?>(.*?)</a>', tdItem).groups()[0]
            cityItemNumber = re.findall(r'\d+', tdItem, re.S)  # [zip code, area code]
            cityItem = cityItemName + "," + cityItemNumber[0] + "," + cityItemNumber[1]
            cityInfoList.append(cityItem)
            file.write(cityItem + '\n')
            print(cityItem)
    file.close()

def main():
    # 1. Fetch the first-level page
    # 2. Parse it to get the province list
    # 3. Scrape the second-level pages on multiple threads
    # 4. Parse each second-level page and save it to a file
    firstPageUrl = "http://www.ip138.com/post/yzbm/yzbm.htm"
    firstPageData = getPageData(firstPageUrl)
    proviceList = parseFirstPageData(firstPageData)
    print(len(proviceList))
    for proviceItem in proviceList:
        threadPool.submit(getProviceData, proviceItem).add_done_callback(parseSecondPageData)
    threadPool.shutdown(wait=True)  # wait for all workers to finish

if __name__ == '__main__':
    main()
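
One thing worth knowing about this pattern: add_done_callback runs the callback in the worker thread that completed the future (or immediately in the submitting thread if the future has already finished), so the parsing and file writing also happen off the main thread.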

Problems encountered:

1. The site inspects incoming requests and returns 503 outright for any that don't look legitimate, so the request headers have to be filled in.

2. When parsing the pages, study the markup carefully and look for a property that uniquely identifies the data you want; if there isn't one, filter the results afterwards. Pick the concrete method for the concrete scenario. Regular expressions are genuinely powerful here, and the fundamentals matter.

3. This site really put effort into its anti-crawling, with several layers of defenses: more than 5 requests in a row gets you banned. I haven't found a good way past this second layer yet; suggestions from the experts are welcome (one common mitigation is sketched below).
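A common mitigation (a minimal sketch, not verified against this site) is to throttle requests with a random delay and rotate the User-Agent header between requests. The politeGet helper and the userAgents pool below are illustrative assumptions, not part of the original code:

import time
import random
import requests

# Placeholder pool of User-Agent strings; extend with real ones as needed.
userAgents = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.2 Safari/605.1.15',
]

def politeGet(pageUrl):
    time.sleep(random.uniform(1, 3))  # random pause so requests don't arrive in a burst
    headers = {'User-Agent': random.choice(userAgents)}  # vary the client fingerprint
    return requests.get(pageUrl, headers=headers)

Note that with a 30-thread pool the delay only applies per thread, so the pool size would also need to shrink (or the requests be serialized) for the throttling to have any effect.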

Finally, here is a script that reads the data back out of the folder and reassembles it into SQL UPDATE statements:

import os
from os import path

def main():
    writeFile = open("updateSql", 'w', encoding='utf8')
    filePath = "area"
    fileList = os.listdir(filePath)
    for fileItem in fileList:
        readFile = open(path.join(filePath, fileItem), encoding='utf-8')
        for lineStr in readFile.readlines():
            lineStr = lineStr.strip()
            itemArray = lineStr.split(",")  # [city, zip code, area code]
            updateSqlStr = "UPDATE ml_mobile SET ml_p_areacode = '%s', ml_p_zip = '%s' WHERE ml_p_city = '%s';" % (itemArray[2], itemArray[1], itemArray[0])
            writeFile.write(updateSqlStr + '\n')
        readFile.close()
    writeFile.close()

if __name__ == '__main__':
    main()
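
For example, a saved line such as 北京,100000,010 (city, zip code, area code; the actual values depend on what was scraped) turns into:

UPDATE ml_mobile SET ml_p_areacode = '010', ml_p_zip = '100000' WHERE ml_p_city = '北京';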

 
