# Python 爬虫:豆瓣租房

import urllib.request
import urllib.error
import urllib.response
import urllib.parse
import time
import re

# Topic IDs collected by GetPageNum(); the script entry point feeds them to GetInfo().
pagenum = []

# URL prefixes: the topic ID / list offset is appended to them.
_TOPIC_URL_PREFIX = 'https://www.douban.com/group/topic/'
_LIST_URL_PREFIX = 'https://www.douban.com/group/beijingzufang/discussion?start='

# Step between list offsets (the original entry point divided and multiplied by 25).
_PAGE_SIZE = 25

# The original built these headers but never passed them to urlopen().
_REQUEST_HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
_REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection blocks forever

# NOTE(review): the HTML tags inside the original regex literals were lost when
# this script was extracted (only '.*?', '(.*?)' and line breaks survived), so
# the three patterns below are reconstructions -- confirm them against the
# markup Douban currently serves.
# Presumably the first paragraph of the topic body.
_TOPIC_BODY_RE = re.compile(r'<div class="topic-content">.*?<p>(.*?)</p>', re.S)
# Tags turned into newlines before printing (the original replaced one tag with "\n").
_PARAGRAPH_BREAK_RE = re.compile(r'<br\s*/?>|</?p>')
# The original sliced each match with [36:-3]; 36 is the length of a double quote
# plus the topic URL prefix, so the slice presumably isolated the numeric topic ID.
_TOPIC_ID_RE = re.compile(re.escape(_TOPIC_URL_PREFIX) + r'(\d+)')


def _fetch(url):
    """Return the body of *url* decoded as UTF-8, or None if the request failed.

    Failures are reported on stdout (HTTP status code and/or reason), as the
    original code did, and are not raised to the caller.
    """
    request = urllib.request.Request(url, headers=_REQUEST_HEADERS)
    try:
        # The context manager closes the response even if read() fails.
        with urllib.request.urlopen(request, timeout=_REQUEST_TIMEOUT) as response:
            # errors='replace' keeps one badly encoded page from aborting the crawl.
            return response.read().decode('utf-8', errors='replace')
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    except OSError as e:
        # A timeout while reading the body is not wrapped in URLError.
        print(e)
    return None


def GetInfo(page, url_prefix=_TOPIC_URL_PREFIX):
    """Fetch one topic page and print every body fragment found in it.

    Args:
        page: Topic ID; appended to *url_prefix* to form the request URL.
        url_prefix: URL prefix of topic pages (defaults to the Douban one).

    Returns:
        None. Output goes to stdout; network errors are printed, not raised.
    """
    content = _fetch(url_prefix + str(page))
    if content is None:
        return
    for item in _TOPIC_BODY_RE.findall(content):
        info = _PARAGRAPH_BREAK_RE.sub("\n", item)
        print("----------------------------------------------------------")
        print(info)
        print("----------------------------------------------------------")


def GetPageNum(start, url_prefix=_LIST_URL_PREFIX):
    """Fetch one discussion-list page and append its topic IDs to ``pagenum``.

    Args:
        start: Value of the ``start`` query parameter; appended to *url_prefix*.
        url_prefix: URL prefix of the list page (defaults to the
            ``beijingzufang`` group).

    Returns:
        None. IDs are appended to the module-level ``pagenum`` list as
        strings; an ID that is already present is not appended again, so a
        topic is never fetched twice.
    """
    content = _fetch(url_prefix + str(start))
    if content is None:
        return
    for topic_id in _TOPIC_ID_RE.findall(content):
        if topic_id not in pagenum:
            pagenum.append(topic_id)


def _main():
    """Ask how many of the newest topics to show, then fetch and print them."""
    print("请输入最新需要条数: ")
    wanted = int(input())
    # Round up so that a request for e.g. 30 topics still reads two list pages.
    page_count = -(-wanted // _PAGE_SIZE)
    # Offsets start at 0; the original loop started at 25.
    # NOTE(review): assumes `start` is a zero-based topic offset, so the old
    # loop skipped the newest page -- confirm against the site.
    for page_index in range(page_count):
        GetPageNum(page_index * _PAGE_SIZE)
    # max() keeps a negative request from turning into a from-the-end slice.
    for num in pagenum[:max(wanted, 0)]:
        GetInfo(num)


if __name__ == '__main__':
    _main()

# 你可能感兴趣的:(python)