使用正则表达式提取网页数据时,关键在于确定目标数据周边的上下文信息:如何用尽可能少、尽可能短的正则表达式唯一地定位到目标数据,是我们需要解决的问题。
另外,爬取某些网站时,必须在请求头中带上cookie,否则服务器不会返回数据。
下面的示例爬取瓜子二手车各个城市的车辆信息。
其中,数据提取部分用到了json解析与正则表达式。
import requests,re,os,json,pymysql
"""获取城市名称"""
def get_city_msg():
#设置headers
headers = {
'Host': 'www.guazi.com',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0',
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Accept-Encoding': 'gzip, deflate, br',
'Referer': 'https://www.guazi.com/nanchong/',
'X-Requested-With': 'XMLHttpRequest',
'Connection': 'keep-alive',
'Cookie': 'xxxxx'
}
base_url = 'https://www.guazi.com/bj/?act=ajaxGetOpenCity'
response = requests.get(url=base_url, headers=headers)
contents = response.content.decode('utf-8')
#解码json数据
data = json.loads(contents)
city_msg_dict = data['data']['cityList']['all']
city_list = []
for shouzimu,li in city_msg_dict.items():
city_dict = {}
for city in li:
city_dict['城市名'] = city['name']
city_dict['简写'] = city['domain']
city_list.append(city_dict)
return city_list
"""获取车辆信息"""
def guazi():
# Crawl used-car listing pages from www.guazi.com for every city returned by
# get_city_msg() and append the extracted fields to '瓜子二手车.txt'.
# Takes no arguments; no return statement is present.
#
# NOTE(review): this copy of the function has lost its indentation, so the
# loop nesting described in the comments below is inferred from statement
# order and variable use -- confirm against the original source.
# NOTE(review): the regex literals below look truncated (it appears the HTML
# tags they were written to match have been stripped out). The "current
# price" and "description" patterns are split across two lines and the
# "image" pattern is an unterminated string, so the function cannot run as
# written. Restore the patterns before use -- TODO confirm.
#
# Request headers sent with every listing-page request. 'Cookie' is the
# literal 'xxxxxx' here; presumably a real cookie must be substituted.
headers = {
'Host': 'www.guazi.com',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0',
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Accept-Encoding': 'gzip, deflate, br',
'Referer': 'https://www.guazi.com/nanchong/buy/',
'Connection': 'keep-alive',
'Cookie': 'xxxxxx'
}
city_list = get_city_msg()
for city_dict in city_list:
# '简写' holds the value get_city_msg() copied from the city's 'domain'
# field; it is used as the path segment of the listing URL below.
city=city_dict['简写']
print('正在爬取' + city_dict['城市名'] + '的相关信息')
# Listing pages 1 through 50 (the URL below uses both city and page).
for page in range(1,51):
base_url = 'https://www.guazi.com/{}/buy/o{}/#bread'.format(city,page)
response = requests.get(url=base_url,headers=headers)
contents = response.content.decode('utf-8')
# with open('guazi.html','w',encoding='utf-8')as f:
# f.write(contents)
# print(contents)
# Split the page into one chunk of text per car; each chunk is then searched
# by the per-field patterns below.
# NOTE(review): pattern appears truncated -- see the note at the top.
car_list_rule = re.compile(r'(.*?) ',re.S)
car_list = car_list_rule.findall(contents)
# print(len(car_list))
# Fields extracted per car: price, description, age, mileage, image.
car_obj_list = []
for car_msg in car_list:
car_dict = {}
# Current price: the two captured groups of the first match are
# concatenated. Indexing [0] raises IndexError when nothing matches.
car_price_rule = re.compile(r'.*?(.*?)(.*?).*?
',re.S)
car_price = car_price_rule.findall(car_msg)
# print(car_price)
car_dict['现价'] = car_price[0][0]+car_price[0][1]
# Original price: optional; falls back to '未知' when there is no match.
car_reprice_rule = re.compile(r'.*?(.*?).*?', re.S)
car_reprice = car_reprice_rule.findall(car_msg)
if car_reprice:
# print(type(car_reprice))
car_dict['原价'] = car_reprice[0]
else:
car_dict['原价'] = '未知'
# Description: first match only; raises IndexError when nothing matches.
car_desc_rule = re.compile(r'(.*?)
',re.S)
car_desc = car_desc_rule.findall(car_msg)[0]
car_dict['描述'] = car_desc
# Age / year: first match, stored under the key '生产日期'.
car_live_year_rule = re.compile(r'(.*?)',re.S)
car_live_year = car_live_year_rule.findall(car_msg)[0]
car_dict['生产日期'] = car_live_year
# Mileage: first match, stored under the key '公里数'.
car_run_miles_rule = re.compile(r'.*?.*?(.*?)<',re.S)
car_run_miles = car_run_miles_rule.findall(car_msg)
# print(car_run_miles)
car_dict['公里数'] = car_run_miles[0]
# Image: first match, stored under the key '图片'.
# NOTE(review): this literal is unterminated in this copy of the file.
car_image_rule = re.compile(r')
car_image = car_image_rule.findall(car_msg)[0]
car_dict['图片'] = car_image
car_obj_list.append(car_dict)
# Append every collected car to the output file as "key:value" lines.
# The file is reopened in append mode for each individual write.
for car_obj in car_obj_list:
# print(car_obj)
for k, v in car_obj.items():
with open('瓜子二手车.txt', 'a', encoding='utf-8')as f:
f.write(k + ':' + v + '\n')
# Blank line written after a car's fields (presumably once per car).
with open('瓜子二手车.txt', 'a', encoding='utf-8')as f:
f.write('\n')
# The city name is written after the car records, not before them.
with open('瓜子二手车.txt', 'a', encoding='utf-8')as f:
f.write(city_dict['城市名'] + '\n')
print('爬取完成。。')
if __name__ == '__main__':
    # guazi() only ever appends to the output file, so discard any file left
    # over from a previous run before starting. Removing directly and
    # ignoring "not found" avoids the check-then-remove race of testing
    # os.path.exists() first.
    try:
        os.remove('瓜子二手车.txt')
    except FileNotFoundError:
        pass
    guazi()