最近在学习爬虫,写了个比较简单的程序,抓取北京二手房房价信息。
# -*- coding: utf-8 -*-
import urllib2
import urllib
import re,os
import time
#from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
class HomeLink:
#初始化数据
def __init__(self,base_url):
self.base_url = base_url
self.page = 1
self.out_put_file = 'D:/python/test/house.txt'
self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
self.headers = { 'User-Agent' : self.user_agent }
#获取页面内容
def get_content(self,url):
try:
request = urllib2.Request(url,headers=self.headers)
response = urllib2.urlopen(request)
act_url = response.geturl()
print 'init url=',url,'act url=',act_url
if url == act_url:
content = response.read()
return content
else:
return None
except urllib2.URLError, e:
if hasattr(e,"reason"):
print u"连接页面失败,错误原因",e.reason
return None
#获取每个区的起始url链接地址
def get_region_url(self):
d_region_url = {}
content = self.get_content(self.base_url)
pattern = re.compile('(.*?)',re.S)
result = re.findall(pattern,content)
if result:
for x in result:
d_region_url[x[1]] = x[0]
else:
pass
return d_region_url
#获取每个区的所有页面url地址列表
def get_region_url_list(self,region_url):
page_num = self.get_page_num(region_url)
l_url = [region_url+'pg'+str(i)+'/' for i in range(2,page_num+1)]
return l_url
#获取总页数
def get_page_num(self,url):
content = self.get_content(url)
pattern = re.compile('{"totalPage":(\d+),"curPage":1}',re.S)
result = re.search(pattern,content)
if result:
return int(result.group(1).strip())
else:
return None
#获取每套房子的房价信息
def get_house_info(self,url,region):
content = self.get_content(url)
pattern = re.compile('' +
'(.*?)(.*?) .*?
.*?
'
+ '.*?(\d+)(\S+)
',re.S)
result = re.findall(pattern,content)
if result:
for x in result:
l = x[1].split('|')
rooms,area,direct,other = l[1],l[2],l[3],l[4]
s_str = '|'.join([region,x[0],rooms,area,direct,other,x[2],x[3]])
self.writeStr2File(self.out_put_file,s_str)
else:
return None
#开始抓取链家网房价数据
def start_scrapy(self):
d_region_url = self.get_region_url()
for k in d_region_url:
region = k
region_init_url = 'http://bj.lianjia.com' + d_region_url[region]
l_region_url = self.get_region_url_list(region_init_url)
for url in l_region_url:
time.sleep(1)
url = url.strip()
self.get_house_info(url,region)
#写文件
def writeStr2File(self,out_put_file,str1,append = 'a'):
# 去掉文件,保留路径。比如 'a/b/c/d.txt' 经过下面代码会变成 'a/b/c'
subPath = out_put_file[:self.out_put_file.rfind('/')]
# 如果给定的路径中,文件夹不存在,则创建
if not os.path.exists(subPath):
os.makedirs(subPath)
# 打开文件并将 str 内容写入给定的文件
with open(out_put_file, append) as f:
f.write(str1.strip()+'\n')
# Guard the entry point so importing this module does not start a crawl.
if __name__ == '__main__':
    url = 'http://bj.lianjia.com/ershoufang/'
    home = HomeLink(url)
    home.start_scrapy()
来自 “ ITPUB博客 ” ,链接:http://blog.itpub.net/10972173/viewspace-2134897/,如需转载,请注明出处,否则将追究法律责任。