【Python爬虫】51查号 保存至MongoDB

一共创建了三个表（MongoDB 集合）：yidong、dianxin、liantong，分别用来保存爬取到的中国移动、中国电信、中国联通的号码数据。

# coding:utf-8
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import re
from multiprocessing.dummy import Pool as ThreadPool
from pymongo import MongoClient


# Site root; the relative hrefs of city pages are appended to it.
home_pageUrl = 'http://www.51hao.cc/'

# Browser-like User-Agent sent with every request.
user_agent = (
    'Mozilla/5.0 (Windows NT 10.0; WOW64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/59.0.3071.115 Safari/537.36'
)
headers = {'User-Agent': user_agent}

# Index page that links to every province and city page.
all_cityUrl = home_pageUrl + 'all.html'

# Absolute URLs of the city pages found on the index page.
cityList = []
# Not referenced by the rest of the visible code.
phoneNumber = {}

def download(url, timeout=10):
    """Fetch *url* using the module-level request headers.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled connection would block the calling thread
            indefinitely.

    Returns:
        The ``requests.Response`` with its encoding forced to gb2312, or
        ``None`` when the request failed (the error is printed).
    """
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
    except RequestException as e:
        print("The problem is {}!".format(e))
        return None
    # Force the encoding; without it the Chinese text comes out garbled.
    r.encoding = 'gb2312'
    return r

def getCity(url):
    """Collect the city pages linked from the index at *url* and crawl them.

    Every link whose href ends in ``city/<name>`` is treated as a province.
    For each province, the links of the form ``<province href>/<name>.php``
    are treated as its cities; their absolute URLs are appended to the
    module-level ``cityList``. Finally ``getNumber`` is mapped over
    ``cityList`` on the module-level thread pool ``p``.

    Args:
        url: Address of the index page.

    Returns:
        None. Nothing is crawled when the index page cannot be downloaded.
    """
    r = download(url)
    if r is None:
        # Without the index page there is nothing to parse or crawl.
        print("Skip {}: index page could not be downloaded".format(url))
        return
    Soup = BeautifulSoup(r.text, 'lxml')

    for province in Soup.find_all('a', href=re.compile(r'city/\w+$')):
        print(province.text)
        # The href is literal text, so escape it before embedding it in a
        # pattern; the dot of ".php" is escaped for the same reason.
        cityPattern = re.compile(re.escape(province['href']) + r'/\w+\.php')
        for city in Soup.find_all('a', href=cityPattern):
            print(city.text, city['href'])
            cityList.append(home_pageUrl + city['href'])

    p.map(getNumber, cityList)

def getNumber(url):
    """Scrape one city page and store its numbers in MongoDB.

    The province and city are parsed from the page title. Every link under
    a carrier menu is then inserted, one document per link, into that
    carrier's collection of the module-level ``db`` (``liantong``,
    ``dianxin`` or ``yidong``).

    A page that cannot be downloaded or lacks the expected title is
    reported and skipped, so one bad page does not abort the whole
    ``map`` over the city list.

    Args:
        url: Address of the city page.

    Returns:
        None.
    """
    r = download(url)
    if r is None:
        print("Skip {}: page could not be downloaded".format(url))
        return
    Soup = BeautifulSoup(r.text, 'lxml')

    titles = Soup.select('div[class~=title] > span')
    if not titles:
        print("Skip {}: title not found".format(url))
        return
    # Drop the last 10 characters of the title; presumably a fixed suffix
    # that follows the province and city names -- confirm against the site.
    Province_City = titles[0].text[:-10]
    # The first run of CJK characters is taken as the province.
    provinceMatch = re.search(u"[\u4e00-\u9fa5]+", Province_City)
    if provinceMatch is None:
        print("Skip {}: province not found in title".format(url))
        return
    Province = provinceMatch.group().strip()
    # NOTE(review): the fixed offset 4 assumes the city always starts at the
    # fifth character of the title -- verify for provinces whose names are
    # not three characters long.
    City = Province_City[4:].strip()

    # Compiled once instead of once per menu. The dots are unescaped, as in
    # the original pattern, so they match any character.
    numberLink = re.compile(r'../../mobile/')
    # (CSS class of the menu <div>, MongoDB collection, carrier label)
    carriers = (
        ('ab_menu cuc', 'liantong', '中国联通'),
        ('ab_menu ctc', 'dianxin', '中国电信'),
        ('ab_menu cm', 'yidong', '中国移动'),
    )
    for menuClass, collection, carrier in carriers:
        for menu in Soup.find_all('div', class_=menuClass):
            numberList = menu.find_next('ul')
            if numberList is None:
                # A menu with no following list has nothing to store.
                continue
            for num in numberList.find_all('a', href=numberLink):
                db[collection].insert_one({
                    '号码': num.text,
                    '省/直辖市': Province,
                    '市': City,
                    '运营商': carrier,
                })
            

# Script entry point. client, db and p are module-level names because
# getCity and getNumber read them as globals.
client = MongoClient()      # created with pymongo's default connection settings
db = client.PhoneNumber     # database that holds the three carrier collections
p = ThreadPool(4)           # thread pool used to crawl city pages concurrently
try:
    getCity(all_cityUrl)
finally:
    # Always shut the pool down, even when the crawl raised.
    p.close()
    p.join()

【Python爬虫】51查号 保存至MongoDB_第1张图片
2.png
【Python爬虫】51查号 保存至MongoDB_第2张图片
3.png

你可能感兴趣的:(【Python爬虫】51查号 保存至MongoDB)