爬取目标:
https://www.gsmchoice.com/zh-cn/catalogue/
爬取层级:3
爬取信息:各品牌、各型号手机规格数据
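运行环境(库名在提取时丢失,以下按版本号与 pip list 的字母顺序推断补全):
Python 3.9.7
beautifulsoup4 4.10.0
bs4 0.0.1
certifi 2021.5.30
charset-normalizer 2.0.6
idna 3.2
lxml 4.6.3
pip 21.2.4
requests 2.26.0
setuptools 57.4.0
soupsieve 2.2.1
urllib3 1.26.7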
获取待爬取的三级url
'''
* @Description: 爬取 www.gsmchoice.com 三级网页的手机信息
* @Param: url level info
* @return: phone_info
* @Author: [email protected]
* @Date: 2021-09-22
* 三级页面规则比较明显,不使用re库
'''
import random
import re
import requests
from bs4 import BeautifulSoup
'''
in : 一级url
out : 二级url https://www.gsmchoice.com/zh-cn/catalogue/nec/
'''
def craw_lev1(base_url, url, timeout=30):
    """Collect the brand (level-2) page URLs linked from the catalogue page.

    Args:
        base_url: Site root that is prepended to relative links, e.g.
            ``"https://www.gsmchoice.com"``.
        url: Level-1 catalogue page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of absolute URLs, one per ``<a href=...>`` found inside the
        element whose id is ``CatalogueBrands``, in document order.

    Raises:
        requests.RequestException: The download failed or timed out.
        AttributeError: The page contains no ``CatalogueBrands`` element.
    """
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
    ]
    # A random User-Agent is sent with each call.
    req_headers = {'User-Agent': random.choice(user_agent_list)}
    # Without a timeout a stalled connection would block the crawl forever.
    req_obj = requests.get(url, headers=req_headers, timeout=timeout)
    bresp = BeautifulSoup(req_obj.text, 'lxml')
    brands = bresp.find(id='CatalogueBrands')

    li = []
    # href=True skips anchors that carry no href attribute at all.
    for item in brands.find_all('a', href=True):
        href = item['href']
        # The hrefs inside this container were checked to be unique, so
        # level-1 links are appended without de-duplication.
        if href.startswith(('http://', 'https://')):
            # Already absolute; test the prefix rather than searching for
            # "https" anywhere in the string.
            li.append(href)
        else:
            li.append(base_url + href)
    return li
'''
in : 二级url
out : 三级url https://www.gsmchoice.com/zh-cn/catalogue/nec/mediasxn06e/
'''
def craw_lev2(url, timeout=30):
    """Collect the phone (level-3) page URLs listed on one brand listing page.

    Args:
        url: Paginated level-2 URL of the form
            ``https://www.gsmchoice.com/zh-cn/catalogue/<brand>/models/<offset>``.
            The brand slug is read from the third-from-last ``/`` segment.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of absolute level-3 URLs, one per anchor whose markup links
        into the brand's catalogue path. Duplicates are kept. The list is
        empty when the page has no phone container.

    Raises:
        requests.RequestException: The download failed or timed out.
    """
    base_url = "https://www.gsmchoice.com/"
    factory = url.split('/')[-3]
    # Raw strings keep "\w" intact for the regex engine, and the slug is
    # escaped because it is literal text, not a pattern. Compiled once here
    # instead of once per anchor.
    link_re = re.compile(
        r'href="/zh-cn/catalogue/' + re.escape(factory) + r'/\w*'
    )

    req_obj = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(req_obj.text, 'html.parser')
    containers = soup.find_all(
        'div', class_='phone-container phone-container--left'
    )
    # Both containers are read only when there are exactly two; in every
    # other case only the first one (if any) is used.
    selected = containers if len(containers) == 2 else containers[:1]

    base_url3 = []
    for container in selected:
        for anchor in container.find_all('a'):
            match = link_re.search(str(anchor))
            if match is None:
                # Anchor does not point into this brand's catalogue path.
                continue
            # 'href="/zh-cn/...' -> 'zh-cn/...', then prefix the site root.
            base_url3.append(base_url + match.group(0).split('"/')[1])
    return base_url3
def page_num(u, timeout=30):
    """Return the first number found in the third-from-last ``<b>`` element.

    The caller converts the result with ``int()`` and treats it as the number
    of phones listed for the brand when working out how many listing pages to
    request.

    Args:
        u: URL of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The first run of digits in that element's markup, as a string.

    Raises:
        requests.RequestException: The download failed or timed out.
        IndexError: The page has fewer than three ``<b>`` elements, or the
            selected element contains no digits.
    """
    req_obj = requests.get(u, timeout=timeout)
    soup = BeautifulSoup(req_obj.text, 'html.parser')
    b = soup.find_all('b')
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    return re.findall(r"\d+", str(b[-3]))[0]
if __name__ == '__main__':
    # Script entry point: walk every brand, page through its model listing,
    # and append the discovered level-3 URLs to the results file.
    base_url = "https://www.gsmchoice.com"
    url_lev1 = "https://www.gsmchoice.com/zh-cn/catalogue/"
    # Listing pages are addressed by offsets that advance in steps of 40.
    page_size = 40

    # 410 brands according to the original author's note.
    url_lev2 = craw_lev1(base_url, url_lev1)

    # For each brand page, visit its paginated listings and collect the
    # level-3 (brand/model) URLs.
    with open("/Users/zjk/IdeaProjects/test_Python/resource/craw_results.txt", 'a', encoding="utf-8") as file:
        for iu in url_lev2:
            url_lev3 = []
            intn = int(page_num(iu))
            # Ceiling division: number of listing pages needed for intn items.
            n = -(-intn // page_size)
            for x in range(n):
                # e.g. https://www.gsmchoice.com/zh-cn/catalogue/huawei/models/80
                real_url = iu + "models/" + str(x * page_size)
                status_code = requests.get(real_url, timeout=30).status_code
                if status_code != 200:
                    # Report the failure and skip the page instead of
                    # claiming success for it.
                    print(str(status_code) + "-爬取失败:" + real_url)
                    continue
                url_lev3 += craw_lev2(real_url)
                print(str(status_code) + "-成功爬取:" + real_url)
            file.writelines(m + "\n" for m in url_lev3)
遍历文件内的三级url,爬取所需的手机信息
【单线程】
#coding:utf-8
'''
* @Description: 爬取指定单页url的 数据 分行存储\t\t\t分割
* @Param: url_lev3
* @return: 每一种手机模型数据
* @Author: [email protected]
* @Date: 2021-09-23
'''
import random
import re
from multiprocessing.pool import ThreadPool
import requests
import unicodedata
from bs4 import BeautifulSoup
# import logging
# logging.captureWarnings(True)
from requests.packages import urllib3
urllib3.disable_warnings()
def get_soup(url_lev3, timeout=30):
    """Download one phone page and return its two spec tables as soups.

    Args:
        url_lev3: URL of a level-3 (single phone) page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A two-item list ``[soup_one, soup_two]``. ``soup_one`` is the first
        element with class ``PhoneData YesDict``, re-parsed on its own.
        ``soup_two`` is the first such element whose markup contains
        '加速度计' (accelerometer), re-parsed on its own, or the string
        ``'null'`` when no element contains it.

    Raises:
        requests.RequestException: The download failed or timed out.
        IndexError: The page has no ``PhoneData YesDict`` element.
    """
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
    ]
    req_headers = {'User-Agent': random.choice(user_agent_list)}

    # Without a timeout a stalled connection would block the worker forever.
    req_obj = requests.get(url_lev3, headers=req_headers, timeout=timeout)
    # Decode with the encoding detected from the body, not the header.
    req_obj.encoding = req_obj.apparent_encoding
    soup = BeautifulSoup(req_obj.text, 'lxml')
    soup_t = soup.find_all(class_='PhoneData YesDict')

    # Each table is re-parsed from its own markup so it is a standalone soup.
    soup_one = BeautifulSoup(str(soup_t[0]), 'lxml')
    for sou in soup_t:
        html_text = str(sou)
        if '加速度计' in html_text:
            # Only the first matching table is ever used, so stop here.
            return [soup_one, BeautifulSoup(html_text, 'lxml')]
    return [soup_one, 'null']
def craw_cell1(soup_1):
# Builds a {label: value} dict from one spec table: labels are taken from the
# elements with class 'phoneCategoryName', values from the elements with
# class 'phoneCategoryValue'.
# NOTE(review): several string literals below appear to have lost their
# HTML-tag contents (e.g. the no-op .replace('','') and the unterminated
# quotes in the split()/if/elif lines), and the loop header that defines
# `index` is missing. Restore them from the original script before running.
# Extract keys and values separately with regexes, then clean them up.
# Keys: the label texts.
item = re.sub(r'\t*|\n*|\[|\]','',unicodedata.normalize('NFKC', str(soup_1.find_all(class_='phoneCategoryName')).replace('\xa0','')))
key_item = str(item).replace('','').replace(' ','')
key_li = key_item.split(', ')
# Values: the matching value cells.
item_v = re.sub(r'\t*|\n*|\[|\]','',str(soup_1.find_all(class_='phoneCategoryValue')))
item_v_li = item_v.split(', in item_v_li[index]:
item_v_li[index] = re.sub('(.*)ue">||||\xa0','',item_v_li[index].replace('
',' '))
elif (' in item_v_li[index]) & (' in item_v_li[index]):
ss = re.sub(r'.* ','',item_v_li[index])
cont = ss.split('">')[1].split('')[0]
if 'tick' in item_v_li[index]:
item_v_li[index] = 'yes '+ cont
elif 'cross' in item_v_li[index]:
item_v_li[index] = 'no '+ cont
else:
item_v_li[index] = 'unknown'
elif ' in item_v_li[index]:
item_v_li[index] = item_v_li[index].split(')[0].split('">')[-1]
elif ' in item_v_li[index]:
mark = item_v_li[index].split(')[1].split('">')[0]
if mark=='tick':
item_v_li[index] = 'yes'
elif mark=='question':
item_v_li[index] = 'unknown'
else:
item_v_li[index] = 'no'
elif '' in item_v_li[index]:
item_v_li[index] = item_v_li[index].split('')[0].split('">')[-1]
else:
item_v_li[index] = item_v_li[index].split('">')[-1].replace('
',' ')
# Final pass: strip any dirty characters the branches above missed.
item_v_li[index] = re.sub(r'||
| ','')
# Keys and values are paired by position, and only when both lists have the
# same length.
if len(key_li) == len(item_v_li):
res_li = {
}
for ind in range(0,len(key_li)):
res_li[key_li[ind]] = item_v_li[ind]
return res_li
def craw_cell2(soup_2):
res_li = {
}
sub = re.sub(r'\t*|\n*|\xa0', '', str(soup_2))
findall_key = re.findall(r'me">(.+?)
TODO:【封装+线程池】
# NOTE(review): these three one-line stubs presumably stand in for the full
# get_soup / craw_cell1 / craw_cell2 definitions given earlier in this file;
# confirm and substitute the real bodies before running this variant.
def get_soup(url_lev3):...
def craw_cell1(soup_1):...
def craw_cell2(soup_2):...
def thread_job(url):
    """Crawl one phone page and append its specs as one record to the output.

    The record is the values of a fixed list of spec fields followed by the
    page URL, joined with ``_signal_`` and terminated by a newline. Fields
    missing from the page are written as '未知'. Nothing is written when the
    URL is blank or the page does not answer with HTTP 200.

    Args:
        url: Level-3 page URL. Surrounding whitespace, such as the trailing
            newline of a line read from the URL list, is ignored.

    Returns:
        None.
    """
    # Output column order. Example record start:
    #   Samsung Smartphones_signal_Galaxy M52_signal_SM-M526B/DS, ..._signal_<url>
    fields = (
        '牌子',
        '模型',
        '手机其他名称',
        'Standardy',
        'GSM frequencies',
        'UMTS标准',
        'Standard LTE',
        'Standard 5G',
        '手机规格',
        '防水性和防尘性',
        '大小',
        'Weight',
        'Display',
        '显示保护',
        '通话最长时间',
        '待机最长时间',
        '标准电池',
        '快速充电',
        '无线充电',
        '手机存储',
        '随机存取存储 (RAM)',
        'Memory cards',
        'Operating system',
        '接口',
        '处理器',
        'Touchscreen',
        '双SIM卡',
        'SIM卡标准',
        '卡双模式',
        '混合双卡双待',
        '发行日期',
        '加速度计',
        '接近传感器',
        '光传感器',
        '磁力仪',
        '陀螺仪',
        '晴雨表',
        '高度表',
        '重力感应器',
        '霍尔效应传感器',
        '虹膜扫描仪',
        '指纹采集仪',
        '温度计',
        '湿度计',
    )
    path_ = 'D:\\Py_T\\resources\\result_2021.txt'
    sp_sig = '_signal_'

    # The record terminator is written explicitly below, so the URL's own
    # trailing newline is not relied upon.
    url = url.strip()
    if not url:
        return
    if 200 != requests.get(url, timeout=30).status_code:
        return

    print('开始爬取: ' + url)
    # Fetch and parse the page once; both tables come from the same response.
    soup_1, soup_2 = get_soup(url)
    result = {**craw_cell1(soup_1), **craw_cell2(soup_2)}
    strs = sp_sig.join([result.get(name, '未知') for name in fields] + [url])

    # Open the file only for the write, not for the whole crawl.
    with open(path_, 'a', encoding="utf-8") as file_:
        file_.write(strs + '\n')
    print('成功爬取: ' + url)
if __name__ == '__main__':
    # Script entry point: read the level-3 URL list and crawl the pages with
    # a pool of 10 worker threads.
    _path = 'D:\\Py_T\\resources\\craw_results.txt'

    with open(_path, 'r', encoding="utf-8") as _file:
        # Lines are passed on unmodified; only blank lines are dropped,
        # since they are not URLs.
        urls = [url for url in _file if url.strip()]

    # ThreadPool is the pool class this file imports; the bare name Pool was
    # never defined and raised NameError.
    pool = ThreadPool(processes=10)
    try:
        pool.map(thread_job, urls)
    finally:
        pool.close()  # stop accepting new tasks
        pool.join()   # block until the workers have exited
分隔符:
_signal_
Samsung Smartphones_signal_Galaxy M52_signal_SM-M526B/DS, SM-M526BR/DS_signal_GSM, UMTS, LTE, 5G_signal_850 900 1800 1900 _signal_850 900 1700 1900 2100 _signal_FDD LTE: 2100, 1800, 900, 850 TDD LTE: 2600, 2500, 2300, 1900_signal_yes_signal_触控手机_signal_IP67_signal_164.60 x 76.90 x 8.20 mm_signal_176.00 g_signal_彩色 / Super AMOLED 16M 颜色, 120 Hz 1080 x 2400 px (6.70") 393 ppi ∼85.6% screen-to-body ratio_signal_Corning Gorilla Glass 5_signal_unknown_signal_unknown_signal_Li-Ion 5000 mAh_signal_Fast Charging _signal_no_signal_128 GB, 256 GB_signal_6 GB, 8 GB_signal_yes_signal_Android 11_signal_One UI 3.1_signal_Qualcomm Snapdragon 778G Processor clock: 2.40 GHz 芯的数目: 8 GPU: Adreno 642L _signal_yes_signal_yes_signal_nanoSIM, nanoSIM_signal_dual standby_signal_nanoSIM, microSD_signal_第 3 季 2021_signal_yes_signal_no_signal_yes_signal_yes_signal_yes_signal_no_signal_no_signal_yes_signal_yes_signal_no_signal_yes_signal_no_signal_no_signal_https://www.gsmchoice.com/zh-cn/catalogue/samsung/galaxym52
Samsung Smartphones_signal_Galaxy M22_signal_SM-M225FV/DS, SM-M225F/DS_signal_GSM, UMTS, LTE_signal_850 900 1800 1900 _signal_850 900 1900 2100 _signal_FDD LTE: 2600, 2100, 1800, 900, 850, 800 TDD LTE: 2600, 2500, 2300_signal_no_signal_触控手机_signal_未知_signal_159.90 x 74.00 x 8.40 mm_signal_186.00 g_signal_彩色 / Super AMOLED 16M 颜色, 90 Hz 720 x 1600 px (6.40") 403 ppi ∼83.6% screen-to-body ratio_signal_no_signal_4G: 2280 分钟 (38.0 钟头)_signal_unknown_signal_Li-Ion 5000 mAh_signal_Fast Charging _signal_no_signal_128 GB_signal_4 GB_signal_yes_signal_Android 10_signal_One UI _signal_MediaTek Helio G80 MT6769T Processor clock: 2.00 GHz 芯的数目: 8 GPU: ARM Mali-G52 MC2 _signal_yes_signal_yes_signal_nanoSIM, nanoSIM_signal_dual standby_signal_unknown_signal_第 3 季 2021_signal_yes_signal_yes_signal_yes_signal_yes_signal_yes_signal_no_signal_no_signal_yes_signal_no_signal_no_signal_yes_signal_no_signal_no_signal_https://www.gsmchoice.com/zh-cn/catalogue/samsung/galaxym22
Samsung Smartphones_signal_Galaxy F42 5G_signal_SM-E426B/DS_signal_GSM, UMTS, LTE, 5G_signal_850 900 1800 1900 _signal_850 900 1700 1900 2100 _signal_FDD LTE: 2600, 2300, 1900, 1800, 1700, 850, 800, 700_signal_yes_signal_触控手机_signal_未知_signal_167.20 x 76.40 x 9.00 mm_signal_203.00 g_signal_彩色 / IPS TFT 16M 颜色, 60 Hz 1080 x 2400 px (6.60") 399 ppi ∼82.3% screen-to-body ratio_signal_no_signal_unknown_signal_unknown_signal_Li-Ion 5000 mAh_signal_Fast Charging _signal_no_signal_64 GB, 128 GB_signal_4 GB, 6 GB, 8 GB_signal_yes_signal_Android 11_signal_One UI 3.1_signal_MediaTek Dimensity 700 MT6833 Processor clock: 2.20 GHz 芯的数目: 8 GPU: ARM Mail-G57 MC2 @955 Mhz _signal_yes_signal_yes_signal_nanoSIM, nanoSIM_signal_dual standby_signal_no_signal_第 2 季 2021_signal_yes_signal_yes_signal_no_signal_yes_signal_no_signal_no_signal_no_signal_yes_signal_no_signal_no_signal_yes_signal_no_signal_no_signal_https://www.gsmchoice.com/zh-cn/catalogue/samsung/galaxyf425g
Samsung Smartphones_signal_Galaxy Wide5_signal_SM-E426S_signal_GSM, UMTS, LTE, 5G_signal_850 900 1800 1900 _signal_850 900 1700 1900 2100 _signal_FDD LTE: 2600, 2100, 1900, 1800, 1700, 900, 850, 800, 700 TDD LTE: 2600, 2500, 2300_signal_TDD 5G: 3500_signal_触控手机_signal_未知_signal_167.20 x 76.40 x 9.10 mm_signal_207.00 g_signal_彩色 / IPS TFT 16M 颜色, 90 Hz 1080 x 2408 px (6.60") 400 ppi ∼82.1% screen-to-body ratio_signal_no_signal_unknown_signal_unknown_signal_Li-Ion 5000 mAh_signal_Fast Charging _signal_no_signal_128 GB_signal_6 GB_signal_yes_signal_Android 11_signal_One UI 3.1_signal_MediaTek Dimensity 700 MT6833 Processor clock: 2.20 GHz 芯的数目: 8 GPU: ARM Mail-G57 MC2 @955 Mhz _signal_yes_signal_no_signal_nanoSIM_signal_未知_signal_未知_signal_第 2 季 2021_signal_yes_signal_yes_signal_yes_signal_yes_signal_yes_signal_no_signal_no_signal_yes_signal_no_signal_no_signal_yes_signal_no_signal_no_signal_https://www.gsmchoice.com/zh-cn/catalogue/samsung/galaxywide5
Samsung Smartphones_signal_Galaxy Buddy_signal_SM-A226L_signal_GSM, UMTS, LTE, 5G_signal_850 900 1800 1900 _signal_850 900 1700 1900 2100 _signal_FDD LTE: 2600, 2300, 1900, 1800, 1700, 850, 800, 700_signal_yes_signal_触控手机_signal_未知_signal_167.20 x 76.40 x 9.10 mm_signal_206.00 g_signal_彩色 / IPS TFT 16M 颜色, 60 Hz 1080 x 2408 px (6.60") 399 ppi ∼82.1% screen-to-body ratio_signal_no_signal_unknown_signal_unknown_signal_Li-Ion 5000 mAh_signal_Fast Charging _signal_no_signal_64 GB, 128 GB_signal_4 GB, 6 GB, 8 GB_signal_yes_signal_Android 11_signal_One UI 3.1_signal_MediaTek Dimensity 700 MT6833 Processor clock: 2.20 GHz 芯的数目: 8 GPU: ARM Mail-G57 MC2 @955 Mhz _signal_yes_signal_no_signal_nanoSIM_signal_未知_signal_未知_signal_第 2 季 2021_signal_yes_signal_yes_signal_no_signal_yes_signal_no_signal_no_signal_no_signal_yes_signal_no_signal_no_signal_yes_signal_no_signal_no_signal_https://www.gsmchoice.com/zh-cn/catalogue/samsung/galaxybuddy
Samsung Smartphones_signal_Galaxy M32 5G_signal_SM-A326B/DS_signal_GSM, UMTS, LTE, 5G_signal_850 900 1800 1900 _signal_850 900 1700 1900 2100 _signal_FDD LTE: 2600, 2100, 1900, 1800, 900, 850, 800, 700 TDD LTE: 2600, 2500, 2300_signal_FDD 5G: 2600, 2100, 1800, 1700, 900, 850, 800, 700 TDD 5G: 3500, 2600, 2500, 2300_signal_触控手机_signal_未知_signal_164.20 x 76.10 x 9.10 mm_signal_205.00 g_signal_彩色 / IPS TFT 16M 颜色, 60 Hz 720 x 1600 px (6.50") 270 ppi ∼81.6% screen-to-body ratio_signal_Corning Gorilla Glass 3_signal_4G: 2160 分钟 (36.0 钟头)_signal_unknown_signal_Li-Po 5000 mAh_signal_Fast Charging _signal_no_signal_64 GB, 128 GB_signal_4 GB, 6 GB, 8 GB_signal_yes_signal_Android 11_signal_One UI _signal_MediaTek Dimensity 720 MT6853 Processor clock: 2.00 GHz 芯的数目: 8 GPU: ARM Mali-G57 MC3 _signal_yes_signal_yes_signal_nanoSIM, nanoSIM_signal_dual standby_signal_nanoSIM, microSD_signal_第 1 季 2021_signal_yes_signal_yes_signal_yes_signal_yes_signal_yes_signal_no_signal_no_signal_yes_signal_yes_signal_no_signal