Goal: crawl the postal code and telephone area code for every region in the country.
Approach: 1. Get the list of provinces from the first-level page.
2. Use a thread pool to crawl and parse each province's page and save the results to files.
Code:
import re
import requests
import hashlib
import urllib
import time
import os
from os import path
from concurrent.futures import ThreadPoolExecutor
threadPool = ThreadPoolExecutor(30)  # create one thread pool holding up to 30 worker threads
def getPageData(pageUrl):
    # Set request headers; with so much anti-crawling in place now, requests missing these get blocked
    headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
               'Cache-Control': 'max-age=1',
               'Connection': 'keep-alive',
               'Referer': 'https://www.hao123.com/?tn=99480011_hao_pg',
               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'}
    response = requests.get(pageUrl, headers=headers)
    if response.status_code == 200:
        print("------success--------" + pageUrl)
        return response.text
    else:
        print("------failed--------" + pageUrl + "----" + str(response.status_code))
        # on failure the status code is returned instead of the page text
        return response.status_code
def getProviceData(provice):
    fullUrl = 'http://www.ip138.com/post/yzbm/%s' % provice
    proviceName = provice.split(".")[0]
    proviceContent = getPageData(fullUrl)
    return {'name': proviceName, 'content': proviceContent}
def parseFirstPageData(firstPageData):
    proviceUrlList = list()
    # re.S lets '.' match newlines, so the whole page is matched as one line
    urls = re.findall(r'shape="poly".*?href="(.*?)"', firstPageData, re.S)
    for url in urls:
        if len(url) != 1:
            if url not in proviceUrlList:
                proviceUrlList.append(url)
    return proviceUrlList
def parseSecondPageData(obj):
    res = obj.result()
    proviceName = res['name']
    proviceContent = res['content']
    print("------fileName--------" + proviceName)
    if not isinstance(proviceContent, str):   # getPageData returned a status code (e.g. 503), skip this province
        return
    os.makedirs("area", exist_ok=True)        # make sure the output folder exists
    fileName = path.join("area", proviceName)
    file = open(fileName, 'w', encoding='utf8')
    # Note: the HTML tags in the two patterns below were swallowed when the post was rendered;
    # the <td>/<a> forms here are reconstructed approximations of the original expressions.
    tdList = re.findall(r'<td.*?>(.*?)</td>', proviceContent, re.I | re.M | re.S)
    print("------tdList--------" + str(len(tdList)))
    cityInfoList = list()
    for tdItem in tdList:
        if "div" in tdItem:
            cityItemName = re.search(r'<a.*?>(.*?)</a>', tdItem).groups()[0]
            cityItemNumber = re.findall(r'\d+', tdItem, re.S)
            cityItem = cityItemName + "," + cityItemNumber[0] + "," + cityItemNumber[1]
            cityInfoList.append(cityItem)
            file.write(cityItem + '\n')
            print(cityItem)
    file.close()
def main():
    # 1. fetch the first-level page
    # 2. parse it to get the province list
    # 3. crawl the second-level pages in the thread pool
    # 4. parse each second-level page and write it to a file
    firstPageUrl = "http://www.ip138.com/post/yzbm/yzbm.htm"
    firstPageData = getPageData(firstPageUrl)
    proviceList = parseFirstPageData(firstPageData)
    print(len(proviceList))
    for proviceItem in proviceList:
        threadPool.submit(getProviceData, proviceItem).add_done_callback(parseSecondPageData)

if __name__ == '__main__':
    main()
Problems encountered: 1. The site inspects every request and answers anything it does not like with a 503, so proper request headers have to be set.
2. When parsing the page you need to study the data and find a property that uniquely identifies what you want; if there is none, filter the matches afterwards and pick the method that fits the concrete case. Regular expressions are very powerful here, and solid fundamentals matter.
3. This site really put effort into anti-crawling and stacks several layers of defenses: more than five requests in a row gets a client banned. I have not found a good way around that second layer yet, so suggestions are welcome; a minimal throttling sketch follows this list.
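One common mitigation for the per-client rate limit from point 3 (a sketch only, not something verified against ip138.com) is to space requests out with a randomized delay and send them through one shared session; the delay bounds and the politeGet helper are assumptions for illustration, and the headers dict is the one already defined in getPageData above.

import random
import time
import requests

session = requests.Session()  # reuse one connection/cookie jar for all requests

def politeGet(pageUrl, headers, minDelay=2.0, maxDelay=5.0):
    # Sleep a random interval before each request so consecutive hits do not
    # arrive fast enough to trip the "ban after five requests" rule described above.
    time.sleep(random.uniform(minDelay, maxDelay))
    return session.get(pageUrl, headers=headers, timeout=10)

With a 30-thread pool the delay has to be respected by every worker (or the pool shrunk to a handful of threads), since the limit applies to the client as a whole rather than to each thread.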
Finally, here is a script that reads the data back out of the folder and reassembles it into SQL statements:
import os
from os import path

def main():
    writeFile = open("updateSql", 'w', encoding='utf8')
    filePath = "area"                          # folder the crawler wrote the province files into
    fileList = os.listdir(filePath)
    for fileItem in fileList:
        readFile = open(path.join(filePath, fileItem), encoding='utf-8')
        for lineStr in readFile.readlines():
            lineStr = lineStr.strip()
            itemArray = lineStr.split(",")     # city name plus the two numbers the crawler extracted
            updateSqlStr = "UPDATE ml_mobile SET ml_p_areacode = '%s', ml_p_zip = '%s' WHERE ml_p_city = '%s';" % (itemArray[2], itemArray[1], itemArray[0])
            writeFile.write(updateSqlStr + '\n')
        readFile.close()
    writeFile.close()

if __name__ == '__main__':
    main()
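These statements interpolate the scraped strings straight into SQL, so a stray quote in a city name would break them. If the target really is a MySQL table, as the ml_* column names suggest, a driver such as pymysql with parameterized queries avoids the quoting problem; the function below is only a sketch, and the connection details in the usage comment are placeholders, not values from the original post.

import os
from os import path
import pymysql  # assumed driver; any DB-API module works the same way

def applyUpdates(connection, areaDir="area"):
    with connection.cursor() as cursor:
        for fileItem in os.listdir(areaDir):
            with open(path.join(areaDir, fileItem), encoding='utf-8') as readFile:
                for lineStr in readFile:
                    itemArray = lineStr.strip().split(",")
                    # %s placeholders let the driver quote the values itself
                    cursor.execute(
                        "UPDATE ml_mobile SET ml_p_areacode = %s, ml_p_zip = %s WHERE ml_p_city = %s",
                        (itemArray[2], itemArray[1], itemArray[0]))
    connection.commit()

# usage (placeholder credentials):
# applyUpdates(pymysql.connect(host='localhost', user='...', password='...', database='...'))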