# -*- coding: utf-8 -*-
"""
Created on Sat Oct 22 21:01:23 2016

@author: hhxsym
"""
import requests
import json
import os
import pymongo
import time
from bs4 import BeautifulSoup
from multiprocessing import Pool  # process pool support

inpath = "C:\\Users\\hhxsym\\Desktop\\课程群Python爬虫"
inpath = unicode(inpath, "utf8")  # without decoding to unicode, the Chinese path cannot be opened
os.chdir(inpath)

# Connect to the database
client = pymongo.MongoClient('localhost', 27017)  # connect to MongoDB
sense = client['sense']  # create the database
url_list = sense['url_list']  # create the collection (table)


def get_city_urls():
    url = 'http://www.senseluxury.com'
    with open('city.html') as f:  # read the locally saved page
        response = f.read()  # read straight into a string
    soup = BeautifulSoup(response, 'lxml')
    urls = soup.select('#destination_nav > div > div > div > dl.dl-list > dt > a')  # CSS selector path; mind the spaces
    return [url.get("href") for url in urls]


def get_page_list(city, page=1):
    now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    url = 'http://www.senseluxury.com/destinations_list/%s' % city.split('/')[-1]
    payload = {'page': page, 'callback': 'jsonp'}
    responses = requests.get(url, params=payload)  # request the page: requests.get(url, params=query dict)
    # print responses.url
    print responses.status_code
    # print responses.text[6:-1]  # print the JSON-formatted "string" (1)
    wb_data = json.loads(responses.text[6:-1])  # strip the jsonp(...) wrapper and parse the string into a Python dict (2)
    print type(responses.text), type(wb_data)  # compare the two types, (1) vs (2)
    # print json.dumps(wb_data, encoding='utf-8', ensure_ascii=False)  # json.dumps re-serializes so the Chinese text prints readably

    # Loop over the records and extract the fields we want
    for i in wb_data['val']['data']:
        title = i['title']
        url = 'http://www.senseluxury.com' + i['url']  # join the pieces into the full URL we want
        id = i['id']
        server = i['server'].replace(u'\xa0', u' ').split()  # normalize non-breaking spaces, then split
        memo = i['memo']
        price = i['price']
        address = i['address']
        subject = i['subject']
        data = {'title': title, 'id': id, 'server': server, 'memo': memo,
                'price': price, 'address': address, 'subject': subject,
                'create_time': now}
        url_list.insert_one(data)  # insert the record (a dict); note: the _id field is generated automatically
        # print title, url
        print data


if __name__ == '__main__':
    # get_page_list(1)
    # print get_city_urls()
    # get_page_list('http://www.senseluxury.com/destinations/2', page=1)
    city_urls = get_city_urls()
    print city_urls
    pool = Pool(processes=4)  # set the number of worker processes
    pool.map(get_page_list, city_urls)  # pool.map(function, iterable)
    pool.close()  # stop feeding new tasks to the pool
    pool.join()  # call after close(); blocks so the main process doesn't exit before the children finish

# To inspect a page's JSON: browser -> right click -> Inspect -> Network -> XHR -> trigger the request (navigate) -> select the request name -> Response -> check whether a JSON-formatted string appears
# http://jsoneditoronline.org/ formats JSON online; handy for reading nested JSON structures
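
# The responses.text[6:-1] slice above works only because the callback is literally
# named "jsonp" (6 characters plus the opening parenthesis). A minimal sketch of a
# more robust alternative, assuming the response is callback(<json>): strip whatever
# callback name wraps the JSON with a regex. strip_jsonp is a hypothetical helper,
# not part of the original script; in practice it would sit above get_page_list.
import re

def strip_jsonp(text):
    """Return the JSON body inside a callback(...) wrapper, or the text unchanged."""
    match = re.match(r'^\s*[\w$.]+\s*\((.*)\)\s*;?\s*$', text, re.S)
    return match.group(1) if match else text

# Usage inside get_page_list: wb_data = json.loads(strip_jsonp(responses.text))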