妹子属性我TM就是对理工科的妹子毫无抵抗之力
需求是爬取安居客平台上南京的浦口和六合的新房房价。样例链接:https://nj.fang.anjuke.com/loupan/pukou/s2/
感觉没啥好总结的。用了BeautifulSoup4,这真是个方便玩意。
重点如下:
别的没啥了。如果当前列表页面展示的信息不够,就点进标题链接,到详情页获取更多信息。
util.py:
# -*- coding: utf-8 -*-
import re

import requests
from bs4 import BeautifulSoup
# Match a JavaScript object field such as `lat: 32.0587` or `lng: "118.62"`
# and capture the (optionally signed, optionally fractional) number.
# `\b` keeps the pattern from matching inside longer identifiers.
_LAT_PATTERN = re.compile(r"""\blat\s*:\s*["']?(-?\d+(?:\.\d+)?)""")
_LNG_PATTERN = re.compile(r"""\blng\s*:\s*["']?(-?\d+(?:\.\d+)?)""")


def get_lat_and_lng_by_url(url):
    """Fetch a page and pull latitude/longitude out of its inline scripts.

    The page is downloaded with a browser-like User-Agent, parsed with
    BeautifulSoup, and every ``<script>`` tag is scanned for ``lat: <number>``
    and ``lng: <number>`` fields.

    Args:
        url: Address of the page to download.

    Returns:
        ``[lat_str, lng_str]`` (both strings, exactly as written in the page)
        taken from the first script that contains both fields, or ``[]`` when
        no script contains both.

    Raises:
        requests.RequestException: If the download fails or exceeds the
            timeout.
    """
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
    # A timeout keeps one unresponsive page from hanging the caller forever.
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'lxml')
    for script_tag in soup.find_all("script"):
        script_text = script_tag.get_text()
        # Regex extraction replaces fixed-offset slicing, which depended on the
        # exact separator and digit count and could loop forever at the end of
        # the script text.
        lat_match = _LAT_PATTERN.search(script_text)
        lng_match = _LNG_PATTERN.search(script_text)
        if lat_match and lng_match:
            return [lat_match.group(1), lng_match.group(1)]
    return []
pukouAndLiuheCrawler.py:
# -*- coding: utf-8 -*-
import requests;
from bs4 import BeautifulSoup;
import os;
import util;
import xlwt;
from util import get_lat_and_lng_by_url;
# Browser-like User-Agent sent with every listing-page request.
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
}

# Listing pages to crawl: five pages for Pukou, then three for Liuhe.
urls = [
    "https://nj.fang.anjuke.com/loupan/pukou/s2/",
    "https://nj.fang.anjuke.com/loupan/pukou/p2_s2/",
    "https://nj.fang.anjuke.com/loupan/pukou/p3_s2/",
    "https://nj.fang.anjuke.com/loupan/pukou/p4_s2/",
    "https://nj.fang.anjuke.com/loupan/pukou/p5_s2/",
    "https://nj.fang.anjuke.com/loupan/liuhe/s2/",
    "https://nj.fang.anjuke.com/loupan/liuhe/p2_s2/",
    "https://nj.fang.anjuke.com/loupan/liuhe/p3_s2/",
]

# Spreadsheet rows; the first row is the header
# (name, link, address, price, latitude, longitude).
excel_lines = [
    ["楼盘名", "链接", "地址", "价格", "纬度", "经度"],
]
# Walk every listing page and append one spreadsheet row per listing card.
for page_url in urls:
    # A timeout keeps one unresponsive page from hanging the whole crawl.
    start_html = requests.get(page_url, headers=headers, timeout=10)
    page_soup = BeautifulSoup(start_html.text, 'lxml')
    for item_mod in page_soup.find_all('div', class_="item-mod"):
        infos = item_mod.select_one('div[class="infos"]')
        if infos is None:
            # "item-mod" blocks without an "infos" div carry no listing data.
            continue

        name_link = infos.select_one('a[class="lp-name"]')
        name_span = name_link.select_one("h3 > span") if name_link is not None else None
        if name_span is None or not name_link.get("href"):
            # Without a name and a detail link there is nothing useful to record.
            continue
        name = name_span.get_text()
        url = name_link["href"]

        # Address and price are optional: record an empty string instead of
        # aborting the crawl with an IndexError when a card lacks them.
        address_tag = infos.select_one('a[class="address"]')
        address = address_tag.get_text() if address_tag is not None else ""
        favor_pos = item_mod.select_one('a[class="favor-pos"]')
        price_tag = favor_pos.select_one("p > span") if favor_pos is not None else None
        price = price_tag.get_text() if price_tag is not None else ""

        # The coordinates come from the listing's detail page; fall back to 0
        # when the helper returns fewer than two values.
        position = get_lat_and_lng_by_url(url)
        if len(position) >= 2:
            lat, lng = position[0], position[1]
        else:
            lat, lng = 0, 0

        print("\n--------分鸽线---------")
        print("楼盘名:" + name)
        print("链接:" + url)
        print("地址:" + address)
        print("价格:" + price)
        # str() is required: lat/lng are the int 0 when no coordinates were
        # found, and "str" + int raises TypeError.
        print("lat:" + str(lat))
        print("lng:" + str(lng))
        excel_lines.append([name, url, address, price, lat, lng])
# Dump the collected rows to the console, then write them to an .xls workbook.
print(excel_lines)
number_of_lines = len(excel_lines)
print("number of lines = " + str(number_of_lines))

wb = xlwt.Workbook(encoding="utf-8")
sheet = wb.add_sheet("mySheet", True)

# Every row has six columns: name, link, address, price, latitude, longitude.
for row_index in range(number_of_lines):
    row = excel_lines[row_index]
    for col_index in range(6):
        sheet.write(row_index, col_index, row[col_index])

wb.save(r"C:\Users\yuanf\Desktop\pklh.xls")
print("End of python script reached.")
完结撒花(并不
他们的经纬度竟然是硬编码在js里的
复习了一下字符串切片操作,list操作和css选择器orz