从主页面获取链接
每个获取到的URL获取明细数据
数据入数据库
获取数据框架
def gethtml(url):  # fetch the page HTML (stub; full implementation below)
    pass
def html_url_data(html,list_url):  # parse a level-1 listing page, collect detail-page URLs (stub)
    pass
def html_room_data(html,list_room_data):  # parse a level-2 detail page, extract its data (stub)
    pass
def datasave(ls_data,table_name):  # insert one scraped row into the database (stub)
    pass
def data_exists(table_name,url_no):  # check whether a row with this ID already exists (stub)
    pass
def geturl(start_url,list_url,depth=5,end_url='/'):  # URL-collection flow (sketch)
    """Walk the paginated listing and accumulate detail URLs into list_url.

    BUGFIX: the sketch used `i` without ever defining it (NameError at
    runtime); iterate the listing pages as the full implementation does.
    """
    for i in range(1, depth):
        url = start_url + str(i) + end_url
        html = gethtml(url)
        html_url_data(html, list_url)
def get_new_room():  # main flow for new-building data (sketch)
    # NOTE(review): start_url, list_url, table_name, url_no and
    # list_room_data are undefined at this point — this is an outline;
    # the runnable version appears later in the file.
    geturl(start_url,list_url)
    for room_url in list_url:
        # only scrape pages we have not stored yet
        if data_exists(table_name,url_no)==None:
            html=gethtml(room_url)
            html_room_data(html,list_room_data)
            datasave(list_room_data,table_name)
def get_old_room():  # main flow for resale ("old house") data (stub)
    pass
def get_old_url(start_url,list_url,depth,end_url):  # collect resale listing URLs (stub)
    pass
def html_old_hourse_data(html,list_data):  # extract data from a resale detail page (stub)
    pass
def main():
    """Entry point (sketch): run both crawl flows in sequence."""
    get_new_room()
    get_old_room()
main()
获取数据完整代码
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import pymysql
# Silence InsecureRequestWarning because gethtml() deliberately fetches
# with verify=False.  NOTE(review): requests.packages.urllib3 is a
# deprecated vendored alias — switch to `import urllib3` if this breaks
# on a newer requests release.
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
def gethtml(url):
    """Fetch *url* and return its HTML text, or '' on any request failure.

    Callers treat '' as "skip this page".  TLS verification is disabled
    (verify=False) to cope with the target site's certificates; the
    matching InsecureRequestWarning is suppressed at import time.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
    }
    try:
        r = requests.get(url, timeout=30, headers=headers, verify=False)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    # BUGFIX: was a bare `except`, which also swallowed KeyboardInterrupt /
    # SystemExit; only network/HTTP errors should be caught here.
    except requests.RequestException:
        print('gethtml_error', url)
        return ''
def html_url_data(html, list_url):
    """Append every <div>'s 'data-link' attribute found in *html* to list_url.

    Divs without the attribute are skipped; order follows the document.
    """
    demo = BeautifulSoup(html, 'html.parser')
    for div in demo.find_all('div'):
        # Tag.get returns None when the attribute is absent — no need for
        # the original per-element try/except KeyError control flow.
        data_link = div.get('data-link')
        if data_link is not None:
            list_url.append(data_link)
def html_room_data(html, list_room_data):
    """Parse a new-building detail page and fill *list_room_data* (a dict).

    Keys written (Chinese, matching the DB columns): 楼盘, each dt/dd
    parameter pair, 价格, 单位, 周边价格, 户型, 地址.
    Raises AttributeError if the page lacks the basic-info/address blocks
    (the caller logs and skips such pages).
    """
    demo = BeautifulSoup(html, 'html.parser')
    # building name
    room_name = demo.find('div', attrs={'class': 'basic-info'}).find('h1').text
    list_room_data['楼盘'] = room_name
    # basic parameter table: dt holds the label, dd the value, paired by index
    dl = demo.find('dl', attrs={'class': 'basic-parms clearfix'})
    dd = dl.find_all('dd')
    dt = dl.find_all('dt')
    for i in range(len(dt)):
        try:
            list_room_data[dt[i].text.strip()] = str.strip(dd[i].text)
        except IndexError:  # a dt without a matching dd — skip the pair
            continue
    try:
        price = dl.find('em', attrs={'class': 'sp-price other-price'}).text  # price
        price_company = dl.find('dd', attrs={'class': 'price'}).find('span').text  # unit
        list_room_data['价格'] = price
        list_room_data['单位'] = price_company
    except AttributeError:
        # No direct price on the page; fall back to the neighbourhood price.
        price = ''
        price_company = ''
        try:
            price_i = dl.find('i', attrs={'class': 'sp-price other-price'}).text
            list_room_data['价格'] = ''
            # BUGFIX: the original stored `price` here, which is always ''
            # in this branch; store the value actually scraped from <i>.
            list_room_data['周边价格'] = price_i
        except AttributeError:
            list_room_data['周边价格'] = ''
    try:
        list_room_data['户型'] = demo.find('div', attrs={'class': 'house-item g-overflow'}).text
    except AttributeError:  # layout section missing on some pages
        pass
    addr = demo.find('a', attrs={'class': 'lpAddr-text g-overflow'}).text  # address
    list_room_data['地址'] = addr
def datasave(ls_data, table_name):
    """Insert one row (dict of column -> value) into *table_name*.

    Column names come from ls_data's keys (whitespace-stripped); values
    are passed as query parameters instead of the original %r string
    interpolation, which broke on quotes and was injection-prone.  The
    table name cannot be a parameter and is interpolated directly — this
    script only ever passes its own hard-coded table names.
    """
    conn = pymysql.connect(host='localhost', user='root', password='root.123',
                           charset='utf8', database='gethtml')
    try:
        with conn.cursor() as cursor:
            columns = ','.join(k.strip() for k in ls_data)
            placeholders = ','.join(['%s'] * len(ls_data))
            sql = 'insert into %s(%s) values(%s);' % (table_name, columns, placeholders)
            cursor.execute(sql, list(ls_data.values()))
        conn.commit()
    finally:
        # BUGFIX: the connection leaked whenever execute() raised.
        conn.close()
def data_exists(table_name, url_no):
    """Return the matching ID row (a tuple) if *url_no* is already stored, else None."""
    conn = pymysql.connect(host='localhost', user='root', password='root.123',
                           charset='utf8', database='gethtml')
    try:
        with conn.cursor() as cursor:
            # The table name cannot be a query parameter; the ID value is
            # parameterized instead of the original "%s"-interpolation.
            cursor.execute('select ID from %s where ID=%%s' % table_name, (url_no,))
            return cursor.fetchone()
    finally:
        # BUGFIX: the original never closed the connection or cursor,
        # leaking one connection per call.
        conn.close()
def geturl(start_url, list_url, depth=5, end_url='/'):
    """Collect detail-page URLs into *list_url* from listing pages 1..depth-1.

    Each page URL is start_url + page number + end_url; failures on a
    single page are logged and the crawl continues.
    """
    page = 1
    while page < depth:
        try:
            page_url = start_url + str(page) + end_url
            html_url_data(gethtml(page_url), list_url)
        except Exception as err:
            print('geturl_error\n', err.args)
        page += 1
def get_new_room():
    """Main flow for new-building listings: crawl URLs, then scrape and store each."""
    list_url=[]
    table_name='building_data'
    start_url='https://xm.fang.anjuke.com/loupan/all/p'
    geturl(start_url,list_url)  # default depth=5 -> listing pages 1..4
    for room_url in list_url:
        try:
            list_room_data={}
            # the first digit run in the URL serves as the primary key
            url_no=re.search(r'[\d]+',room_url).group(0)
            if data_exists(table_name,url_no)==None:
                html=gethtml(room_url)
                if html=='':
                    continue  # fetch failed; skip this listing
                html_room_data(html,list_room_data)
                list_room_data['ID']=url_no
                datasave(list_room_data,table_name)
        except Exception as e:
            # one bad page must not stop the whole crawl — log and move on
            print(room_url,'\n',e.args)
            continue
def get_old_room():
    """Main flow for resale ('old house') listings: crawl URLs, then scrape and store each."""
    list_url = []
    table_name = 'old_hourse_data'
    start_url = 'https://xm.anjuke.com/sale/p'
    end_url = '/#filtersort'
    depth = 20
    get_old_url(start_url, list_url, depth, end_url)
    # append every crawled URL batch to a log file for debugging/replay
    with open('url.txt', 'a', encoding='utf-8') as f:
        f.write(str(list_url) + '\n')
    count = 0
    count_add = 0
    print('\n开始获房子数据')
    for old_hourse_url in list_url:
        count = count + 1
        list_old_hourse_data = {}
        try:
            # BUGFIX: this regex lookup was outside the try, so a single
            # non-matching URL crashed the entire crawl.  Resale IDs look
            # like a letter followed by digits (e.g. 'A123456789').
            url_no = re.search(r'[A-Z][\d]+', old_hourse_url).group(0)
            if data_exists(table_name, url_no) == None:
                html = gethtml(old_hourse_url)
                if html == '':
                    continue  # fetch failed; skip this listing
                html_old_hourse_data(html, list_data=list_old_hourse_data)
                list_old_hourse_data['ID'] = url_no
                datasave(list_old_hourse_data, table_name)
                count_add = count_add + 1
            # per-iteration progress line (new / processed / total)
            print('\r当前速度:新增:%s/处理:%s/总计:%s' % (count_add, count, len(list_url)), end='')
        except Exception as e:
            # BUGFIX: was a bare `except: continue` that silently swallowed
            # every failure — log it like get_new_room does, then continue.
            print(old_hourse_url, '\n', e.args)
            continue
def get_old_url(start_url, list_url, depth, end_url):
    """Crawl resale listing pages 1..depth and append each house's canonical URL.

    Kept URLs are trimmed to end at the letter+digits house ID so they
    are stable across query-string variations.
    """
    for page in range(1, depth + 1):
        # BUGFIX: the original built the URL with str(depth), so every
        # iteration re-fetched the same (last) page; use the loop index.
        # (The original also reused `i` for both loops, shadowing it.)
        url = start_url + str(page) + end_url
        html = gethtml(url)
        demo = BeautifulSoup(html, 'html.parser')
        list_item = demo.find_all('li', attrs={'class': 'list-item'})
        for item in list_item:
            ti = item.find('a', attrs={'class': 'houseListTitle'})
            list_url.append(re.search(r'.*[A-Z][\d]+', ti.attrs['href']).group(0))
            print('\r获取网址:%s' % (len(list_url)), end='')
def html_old_hourse_data(html, list_data):
    """Extract label/value pairs from a resale detail page into *list_data*.

    Each 'houseInfo-detail-item' <li> contains a label <div> followed by
    a value <div>; both are stripped of ASCII colons and all internal
    whitespace before being stored.  Items with fewer than two divs are
    ignored, as are any divs beyond the first two.
    """
    def _clean(node):
        # drop ':' characters, then collapse every run of whitespace
        return ''.join(re.sub(r'[\:]', '', node.text.strip()).split())

    soup = BeautifulSoup(html, 'html.parser')
    for item in soup.find_all('li', attrs={'class': 'houseInfo-detail-item'}):
        divs = item.find_all('div')
        if len(divs) >= 2:
            list_data[_clean(divs[0])] = _clean(divs[1])
def main():
    """Run both crawl flows: new buildings first, then resale houses."""
    get_new_room()
    get_old_room()


# Guard the entry point so importing this module does not start a crawl.
if __name__ == '__main__':
    main()