1. Scraping Amap weather data
import requests
'''
Beijing weather endpoint: https://www.amap.com/service/weather?adcode=110000
Tianjin weather endpoint: https://www.amap.com/service/weather?adcode=120000
adcode list endpoint:     https://www.amap.com/service/cityList?version=201951410
'''
class Gao(object):
def __init__(self):
self.run()
def run(self):
# 声明一个base_url
base_url = "https://www.amap.com/service/weather?adcode="
# 获取所有城市adcode
adcode_list = self.get_adcode()
# print(adcode_list)
# for循环adcode 去获取城市天气信息
for c, adcode_dict in enumerate(adcode_list, 1):
# 获取adcode 用于拼接完整的url
adcode = adcode_dict["adcode"]
# 拼接完整的url
full_url = base_url + adcode
# print(full_url)
            # 发起请求 获取天气json数据
response = requests.get(full_url)
json_data = response.json()
# print(json_data)
# 第一种方式
# msg = json_data.get("data").get("message")
# if msg == "Successful.":
# # 获取天气信息
# weather_name = json_data.get("data").get("data")[0].get("live").get("weather_name")
# # print(weather_name)
#
# # 将天气信息 加入到adcode_dict中
# adcode_dict["weather_name"] = weather_name
# print(c, adcode_dict)
# else:
# print(msg)
# 第二种方式
try:
# 获取天气信息
weather_name = json_data.get("data").get("data")[0].get("live").get("weather_name")
# print(weather_name)
# 将天气信息 加入到adcode_dict中
adcode_dict["weather_name"] = weather_name
print(c, adcode_dict)
except Exception as e:
print(e)
# 获取所有城市adcode
def get_adcode(self):
# 定义adcode接口
base_url = "https://www.amap.com/service/cityList?version=201951410"
# 发起请求
response = requests.get(base_url)
# print(response.json())
# print(response.text)
# 获取json数据
json_data = response.json()
# 获取adcode列表
city_by_letter = json_data.get("data").get("cityByLetter")
# print(city_by_letter)
# 声明一个列表 放所有的城市字典
city_list = []
# 循环遍历字典中的值
# for city_list1 in city_by_letter.values():
# # print(city_list1)
# # 第一种方式
# for city_dict in city_list1:
# print(self.count, city_dict)
# city_list.append(city_dict)
# self.count += 1
# 循环遍历字典中的值
for city_list1 in city_by_letter.values():
# 第二种方式
city_list += city_list1
print(city_list)
# print(len(city_list))
# 所有存放城市字典的列表
return city_list
if __name__ == '__main__':
Gao()
'''
Summary:
- Lists can be concatenated in place with +=, e.g. city_list += city_list1.
- The response tells you whether the lookup succeeded; only read the nested fields after
  confirming success, otherwise the access raises an exception and breaks the run.
  Either check the message field first or wrap the access in try/except.
'''
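A minimal sketch of the two approaches described above, assuming the amap response shape used in the code ({"data": {"message": ..., "data": [{"live": {"weather_name": ...}}]}}):

def extract_weather_checked(json_data):
    # way 1: check the message field before drilling into the nested data
    if json_data.get("data", {}).get("message") == "Successful.":
        return json_data["data"]["data"][0]["live"]["weather_name"]
    return None

def extract_weather_try(json_data):
    # way 2: attempt the lookup and handle the failure
    try:
        return json_data["data"]["data"][0]["live"]["weather_name"]
    except (KeyError, IndexError, TypeError) as e:
        print(e)
        return None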
2. Youdao translation interface
import requests
import time
import random
import hashlib
def md5(value):
# 创建MD5对象
md5_obj = hashlib.md5()
# 加密字符串
md5_obj.update(bytes(value, encoding="utf-8"))
    # 生成32位的16进制摘要
sign = md5_obj.hexdigest()
return sign
def youdao(i):
# 获取salt
salt = str(int(time.time() * 1000)) + str(random.randint(0, 9))
# print(salt)
# 获取sign
sign1 = "fanyideskweb" + i + salt + "@6f#X3=cCuncYssPsuRUE"
sign = md5(sign1)
# 定义data参数
data = {
"i": i,
# "from": "AUTO",
# "to": "AUTO",
# "smartresult": "dict",
"client": "fanyideskweb",
"salt": salt,
"sign": sign,
# "ts": "1558514897639",
# "bv": "cf156b581152bd0b259b90070b1120e6",
# "doctype": "json",
# "version": "2.1",
"keyfrom": "fanyi.web",
# "action": "FY_BY_REALTlME"
}
# 加上请求头 浏览器信息
headers = {
# "Accept": "application/json, text/javascript, */*; q=0.01",
# "Accept-Encoding": "gzip, deflate",
# "Accept-Language": "zh-CN,zh;q=0.9",
# "Connection": "keep-alive",
# "Content-Length": "238",
# "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
"Cookie": "[email protected]; OUTFOX_SEARCH_USER_ID_NCOO=1844201936.6123636; _ga=GA1.2.1939912746.1552966532; JSESSIONID=aaaB9UfpkFL02gnEynoRw; ___rl__test__cookies=1558514897636",
# "Host": "fanyi.youdao.com",
# "Origin": "http://fanyi.youdao.com",
"Referer": "http://fanyi.youdao.com/",
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36",
# "X-Requested-With": "XMLHttpRequest"
}
# 定义起始url
base_url = "http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule"
# 发送请求
response = requests.post(base_url, data=data, headers=headers)
# 获取response里面的json数据
json_data = response.json()
print(json_data)
print(type(json_data))
if __name__ == '__main__':
i = input("请输入需要翻译的内容:")
# i = "banana"
youdao(i)
"""
遇到的问题1:
""只携带参数data 发起请求的时候,请求不到数据,出现{'errorCode'"": ""50},",",
此时的解决方案是:加上请求头浏览器信息 再次发起请求
""问题2:还是获取不到信息 {'errorCode'"": ""50}",",
解决的方案是:把所有的请求头信息添加到headers中
"""
# i: banana
# client: fanyideskweb
# salt: 15585168560444
# sign: da50e3193cda496e1455ff28c1bb21b1
# keyfrom: fanyi.web
#
# i: apple
# "client": "fanyideskweb",
# "salt": "15585148976393",
# "sign": "147950af9758d1e79aeaacd4ff27d14d",
# "keyfrom": "fanyi.web",
#
#
# salt: check how it is generated and whether it is hashed
# sign: likewise, check whether it is hashed
# The first thing to work out is how salt and sign are generated.
'''
salt = o.salt = i = r + parseInt(10 * Math.random(), 10)
"" + (new Date).getTime() + parseInt(10 * Math.random(), 10) js
= "" + int(time.time() * 1000) + random.randint(0, 9)
o = r.generateSaltSign(t) = r(t)
r.generateSaltSign(t) = t.generateSaltSign(t) = r(t)
{
ts: r,
bv: t,
salt: i,
sign: n.md5("fanyideskweb" + e + i + "@6f#X3=cCuncYssPsuRUE")
}
e = t = "apple" 需要翻译的内容
var r = function(e)
{
var
t = n.md5(navigator.appVersion),
r = "" + (new Date).getTime(),
i = r + parseInt(10 * Math.random(), 10);
return {
ts: r,
bv: t,
salt: i,
sign: n.md5("fanyideskweb" + e + i + "@6f#X3=cCuncYssPsuRUE")
}
};
salt length comparison:
15585816225096 python
15585148976393 js
15585822104216
sign = o.sign
= n.md5("fanyideskweb" + e + i + "@6f#X3=cCuncYssPsuRUE")
= md5("fanyideskweb" + "apple" + salt + "@6f#X3=cCuncYssPsuRUE")
'''
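A sketch that ports the r(e) function quoted above to Python in one place; app_version stands for navigator.appVersion copied from the browser (an assumed input here), and the key string is the one captured above:

import time
import random
import hashlib

def js_md5(value):
    return hashlib.md5(value.encode("utf-8")).hexdigest()

def generate_salt_sign(e, app_version):
    # e: the text to translate; app_version: the browser's navigator.appVersion string
    ts = str(int(time.time() * 1000))          # r = "" + (new Date).getTime()
    bv = js_md5(app_version)                   # t = n.md5(navigator.appVersion)
    salt = ts + str(random.randint(0, 9))      # i = r + parseInt(10 * Math.random(), 10)
    sign = js_md5("fanyideskweb" + e + salt + "@6f#X3=cCuncYssPsuRUE")
    return {"ts": ts, "bv": bv, "salt": salt, "sign": sign}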
3. Regex practice: langlang2017
import re
import requests
class Lang:
def __init__(self):
self.run()
def run(self):
# 获取页面信息
base_url = "http://www.langlang2017.com/"
response = requests.get(base_url)
html = response.content.decode("utf-8")
print(html)
self.get_data(html)
    def get_data(self, html):
        # NOTE: this method was scrambled when the notes were exported (the tag text inside the
        # re.compile pattern was stripped and the tail of the method ended up after the __main__
        # block); the pattern marked "placeholder" is an assumption to be checked against the page.
        # 缩小范围
        pattern1 = re.compile('<div class="banner">[\s\S]*?</div>')  # placeholder pattern
        result1 = pattern1.search(html).group()
        # print(result1)
        # 获取alt信息 只返回括号中的内容 括号外面的不返回
        alt_list = re.findall('alt="(.*?)"', result1)
        # print(alt_list)
        # 获取src信息
        src_list1 = re.findall('src="(.*?)"', result1)
        # print(src_list1)
        # 获取完整的src图片链接
        src_list = []
        for s in src_list1:
            src = "http://www.langlang2017.com/" + s
            # print(src)
            src_list.append(src)
        # print(src_list)
        # 获取电话号码 提示:从html里面获取
        phone1 = re.findall('联系电话:(\d{11})', html)[0]
        # phone1 = re.search('1\d{10}', html)
        # group(1)选择匹配到的第一个括号的内容
        # phone = phone1.group()
        print(phone1)
        # 获取所有的http连接 有两个
        http_list = re.findall('"(http.*?)"', html)
        # print(http_list)
        # 获取地址
        address = re.search('地址:(.*?)<', html)  # closing "<" anchor reconstructed (lost on export)
        address = address.group(1)
        lang_dict = {
            "alt_list": alt_list,
            "src_list": src_list,
            "phone": phone1,
            "http_list": http_list,
            "address": address
        }
        print(lang_dict)
        import json
        print(json.dumps(lang_dict))
if __name__ == '__main__':
    Lang()
4. Scraping the Maoyan Top 100 board
import requests
import re
class Mao:
def __init__(self):
        '''
        __init__ only sets up initial state; it should not start the crawl itself.
        It runs when the class is instantiated.
        '''
self.count = 1
self.spider_name = "万能爬虫"
# self.run()
def __call__(self, *args, **kwargs):
        '''
        __call__ runs when an instance of this class is called like a function.
        '''
self.run()
def run(self):
# 获取猫眼的html信息 字符串信息
base_url = "https://maoyan.com/board"
response = requests.get(base_url)
html = response.text
# print(html)
self.get_data(html)
def get_data(self, html):
# 缩小范围 获取电影dd
        # 默认情况下 .*? 匹配不到换行,可以改用[\s\S],或者加re.S标志让.也能匹配\n
        # the <dd> tags in the pattern were lost on export and are reconstructed here
        dd_list = re.findall('<dd>.*?</dd>', html, re.S)
# print(dd_list)
# print(dd_list[0])
# print(len(dd_list))
# import json
# print(json.dumps(dd_list))
# 循环获取dd中的电影信息
for dd in dd_list:
# print(dd)
# 获取排名
            rank = re.findall('board-index.*?>(\d+)</i>', dd)[0]  # tag text reconstructed (lost on export)
# print(rank)
# 获取电影名称
name = re.findall('title="(.*?)" class', dd)[0]
# print(name)
# 获取主演信息
            actor = re.findall('<p class="star">([\d\D]*?)</p>', dd)[0]  # tag text reconstructed (lost on export)
if "主演" not in actor:
actor = ""
else:
# 去掉前面和后面的空白
actor = actor.strip()
# print(actor)
# 上映日期
            publish_date = re.findall('上映时间:(.*?)</p>', dd)[0]  # closing tag reconstructed (lost on export)
# print(publish_date)
# 评分信息
            score_match = re.search('<i class="integer">(.*?)</i><i class="fraction">(\d)</i>', dd)  # tag text reconstructed (lost on export)
            # print(score_match.group(1))
            # print(score_match.group(2))
score = score_match.group(1) + score_match.group(2)
# print(score)
# 获取图片
# 浏览器获取到的信息 和代码获取到的信息 有时候不一样
# 写正则表达式的时候 以代码获取到的字符串 为准
# 在获取图片的过程当中 优先获取大图
pic = re.findall('data-src="(.*?)@160w_220h_1e_1c"', dd)[0]
# print(pic)
# 将电影信息 存入字典中
movie_dict = {
"rank": rank,
"name": name,
"actor": actor,
"publish_date": publish_date,
"score": score,
"pic": pic
}
print(movie_dict)
if __name__ == '__main__':
mao = Mao()
mao()
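A standalone sketch (separate from the Maoyan spider) showing when the magic methods described in the docstrings above fire:

class Demo:
    def __init__(self):
        # runs when the class is instantiated: Demo()
        print("__init__: instance created")

    def __call__(self, *args, **kwargs):
        # runs when the instance is called like a function: demo()
        print("__call__: instance called")

    def __del__(self):
        # runs when the instance is destroyed, e.g. as the script exits
        print("__del__: instance destroyed")

if __name__ == '__main__':
    demo = Demo()   # -> __init__
    demo()          # -> __call__
    # __del__ prints as the program exits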
5. Scraping ihk.cn new-home listings
import re
import requests
class Fang:
def __init__(self):
self.count = 1
def __call__(self, *args, **kwargs):
self.get_max_page()
    # 获取最大页码
    def get_max_page(self):
        base_url = "http://gz.ihk.cn/myxf/houselist/?mark=gzxf089"
        html = self.get_html(base_url)
        # NOTE: the tag text inside this script's regex patterns and the paging logic were lost on
        # export; lines marked "placeholder" are assumptions and must be re-filled from the page source.
        max_page = int(re.findall('data-page="(\d+)"', html)[-1])  # placeholder pattern
        self.get_data(max_page)
    # 获取每一页的新房数据
    def get_data(self, max_page):
        for page in range(1, max_page + 1):
            page_url = "http://gz.ihk.cn/myxf/houselist/?mark=gzxf089&page={}".format(page)  # placeholder paging url
            html = self.get_html(page_url)
            # 缩小范围 每套新房对应一个div
            div_list = re.findall('<div class="house-item">[\s\S]*?</div>', html)  # placeholder pattern
            # print(div_list)
            # print(div_list[0])
            # print(len(div_list))
for div in div_list:
# 获取图片
pic = re.findall('data-original="(.*?)"', div)[0]
# print(pic)
# 获取新房名称
name = re.findall('(.*?)', div)[0]
# print(name)
# 描述信息
desc = re.findall('[\s\S]*?([\w\W]*?)', div)[0]
# print(desc)
# 主力户型
house_type = re.findall('(.*?)', div)[0]
# print(house_type)
# 地址
address = re.findall('(.*?)', div)[0].strip()
# print(address)
# 标签
sign = re.findall('(.*?)', div)
# print(sign)
# 价格
price = re.findall('(.*?)', div)[0]
# print(price)
fang_dict = {
"pic": pic,
"name": name,
"house_type": house_type,
"desc": desc,
"address": address,
"sign": sign,
"price": price
}
print(self.count, fang_dict)
self.count += 1
# 获取指定url的页面
def get_html(self, base_url):
response = requests.get(base_url)
html = response.text
# print(html)
return html
if __name__ == '__main__':
fang = Fang()
fang()
6. China AMC (华夏基金) fund data
import requests
import re
class Hua:
def __init__(self):
pass
def __call__(self, *args, **kwargs):
# self.get_html()
self.get_data()
def get_html(self):
base_url = "http://fund.chinaamc.com/portal/cn/include/newproducthome.jsp"
response = requests.get(base_url)
html = response.text
# print(html)
with open("hua.html", "w", encoding="utf-8") as f:
f.write(html)
def get_data(self):
with open("hua.html", "r", encoding="utf-8") as f:
html = f.read()
# print(html)
# print(type(html))
        table_list = re.findall('<table[\s\S]*?</table>', html)  # <table> tags reconstructed (lost on export)
# print(table_list)
# print(table_list[0])
# print(len(table_list))
#
# import json
# print(json.dumps(table_list))
for c, table in enumerate(table_list):
if c == 0:
print("==========")
                # 缩小范围 获取tr列表
self.tr_list = self.table_handler(table)
# print(tr_list)
# print(tr_list[0])
# print(len(tr_list))
for co, tr in enumerate(self.tr_list, 1):
# 获取基金名称
name_fund_list = self.tr_hander(tr)
fund_list = name_fund_list[1]
fund_dict = {
"name": name_fund_list[0],
"code": fund_list[1].strip(),
"nw_date": fund_list[2],
"net_worth": fund_list[3],
"cum_worth": fund_list[4],
"price_limit": "" if fund_list[5] == "---" else fund_list[5],
"set_up_date": fund_list[6],
"purchase_status": fund_list[7],
"redemption_status": fund_list[8],
"cast_surely_status": fund_list[9],
}
print(co, fund_dict)
# break
elif c == 1:
print("============================")
# 缩小范围 获取tr
self.tr_list = self.table_handler(table)
# print(tr_list)
# print(tr_list[0])
# print(len(tr_list))
for co, tr in enumerate(self.tr_list, 1):
# 获取基金名称
name_fund_list = self.tr_hander(tr)
fund_list = name_fund_list[1]
fund_dict = {
"name": name_fund_list[0],
"code": fund_list[2].strip(),
"nw_date": fund_list[3],
"million_return": fund_list[4],
"seven_day_annualized_yield": fund_list[5],
"aror30": fund_list[6],
"the_year_aror": fund_list[7],
"set_up_date": fund_list[8],
"purchase_status": fund_list[9],
"redemption_status": fund_list[10],
"cast_surely_status": fund_list[11]
}
print(co, fund_dict)
elif c == 2:
print("===================================")
# 缩小范围 获取tr
self.tr_list = self.table_handler(table)
# print(len(tr_list))
# print(tr_list)
for co, tr in enumerate(self.tr_list, 1):
# 获取基金名称
name_fund_list = self.tr_hander(tr)
fund_list = name_fund_list[1]
fund_dict = {
"name": name_fund_list[0],
"code": fund_list[2].strip(),
"nw_date": fund_list[3],
"thousands_return": fund_list[4],
"seven_day_annualized_yield": fund_list[5],
"operation_period": "",
"set_up_date": fund_list[6],
"purchase_status": fund_list[7],
"redemption_status": fund_list[8],
"cast_surely_status": fund_list[9]
}
print(co, fund_dict)
else:
print("=============================")
# 缩小范围 获取tr
self.tr_list = self.table_handler(table)
# print(tr_list)
# print(len(tr_list))
for co, tr in enumerate(self.tr_list, 1):
# 获取name
name_fund_list = self.tr_hander(tr)
fund_list = name_fund_list[1]
fund_dict = {
"name": name_fund_list[0],
"code": fund_list[2].strip(),
"nw_date": fund_list[3],
"net_worth": "" if fund_list[4] == "--" else fund_list[4],
"cum_worth": "" if fund_list[5] == "--" else fund_list[5],
"set_up_date": fund_list[6],
"due_date": fund_list[7],
"cast_surely_status": "" if fund_list[8] == "---" else fund_list[8],
"trade_status": fund_list[9],
}
print(co, fund_dict)
# table中获取tr 缩小范围
def table_handler(self, table):
        tr_list = re.findall('<tr[\s\S]*?</tr>', table)  # <tr> tags reconstructed (lost on export)
del tr_list[0]
return tr_list
def tr_hander(self, tr):
name = re.search('title="(.*?)"', tr).group(1)
        fund_list = re.findall('<td.*?>(.*?)</td>', tr)  # <td> tags reconstructed (lost on export)
return name, fund_list
if __name__ == '__main__':
hua = Hua()
hua()
7. Scraping Qiushibaike (糗事百科)
import requests
from lxml import etree
from fake_useragent import UserAgent
class Qiu:
def __init__(self):
self.count = 1
def __call__(self, *args, **kwargs):
self.get_max_page()
def get_max_page(self):
base_url = "https://www.qiushibaike.com/8hr/page/2/"
html_xml = self.get_html(base_url)
# 获取最大页码
max_page = int(html_xml.xpath("//a/span[@class='page-numbers']/text()")[-1].strip())
# print(max_page)
self.get_data(max_page)
def get_data(self, max_page):
for page in range(1, max_page + 1):
print("===================第{}页开始下载=========================".format(page))
page_url = "https://www.qiushibaike.com/8hr/page/{}/".format(page)
# print(page_url)
html_xml = self.get_html(page_url)
# 缩小范围
li_list = html_xml.xpath("//li[contains(@id, 'qiushi_tag_')]")
# print(len(li_list))
for li in li_list:
# 获取图片
pic = li.xpath(".//a[contains(@class, 'recmd-left')]/img/@src")[0]
# if "/w/150/h/112" in pic:
# pic = "https:" + pic[:-12]
# else:
# pic = ""
# 三元表达式 实现上面的代码
pic = "https:" + pic[:-12] if "/w/150/h/112" in pic else ""
# print(pic)
# 获取昵称
nike_name = li.xpath(".//span[@class='recmd-name']/text()")[0]
# print(nike_name)
# 获取内容
content = li.xpath(".//a[@class='recmd-content']/text()")
content = content[0] if content else ""
# print(content)
# 获取好笑数量
laught_num = li.xpath(".//div[@class='recmd-num']/span[1]/text()")[0]
# if "万" in laught_num:
# laught_num = int(float(laught_num[:-1]) * 10000)
# else:
# laught_num = int(laught_num)
laught_num = int(float(laught_num[:-1]) * 10000) if "万" in laught_num else int(laught_num)
# print(laught_num)
# 评论数量
comment_num = li.xpath(".//div[@class='recmd-num']/span[4]/text()")
comment_num = int(comment_num[0]) if comment_num else 0
# print(comment_num)
qiu_dict = {
"pic": pic,
"nike_name": nike_name,
"content": content,
"laught_num": laught_num,
"comment_num": comment_num,
}
print(self.count, qiu_dict)
self.count += 1
def get_html(self, base_url):
# 随机产生一个浏览器信息
headers = {"User-Agent": UserAgent().random}
response = requests.get(base_url, headers=headers)
html = response.text
# print(html)
html_xml = etree.HTML(html)
return html_xml
if __name__ == '__main__':
qiu = Qiu()
qiu()
8. Scraping Baixing (百姓网) data
import requests
from lxml import etree
class Bai:
def __init__(self):
self.count = 1
def __call__(self, *args, **kwargs):
self.get_max_page()
def get_max_page(self):
base_url = "http://beijing.baixing.com/chongwujiaoyi/m177986/?entities=%E6%80%A7%E5%88%AB_%E5%85%AC&%E4%BB%B7%E6%A0%BC%5B0%5D=1000&%E4%BB%B7%E6%A0%BC%5B1%5D=1100&%E5%B9%B4%E9%BE%84%5B0%5D=0&%E5%B9%B4%E9%BE%84%5B1%5D=3"
html_xml = self.get_html(base_url)
# 获取最大页码
max_page = int(html_xml.xpath("//ul[@class='list-pagination']/li[last()-1]/a/text()")[0])
# print(max_page)
# 获取数据
self.get_data(max_page)
def get_data(self, max_page):
# 循环获取每一页的xml对象 并获取其中的指定的数据
for page in range(1, max_page + 1):
print("================第{}页开始下载======================".format(page))
base_url = "http://beijing.baixing.com/chongwujiaoyi/m177986/?entities=%E6%80%A7%E5%88%AB_%E5%85%AC&page={}&%E4%BB%B7%E6%A0%BC%5B0%5D=1000&%E4%BB%B7%E6%A0%BC%5B1%5D=1100&%E5%B9%B4%E9%BE%84%5B0%5D=0&%E5%B9%B4%E9%BE%84%5B1%5D=3".format(page)
# print(base_url)
html_xml = self.get_html(base_url)
# 缩小范围
li_list = html_xml.xpath("//ul[@class='list-ad-items']/li[@data-aid]")
# print(len(li_list))
# 遍历获取每条狗的信息
for co, li in enumerate(li_list, 1):
# 图片
pic = li.xpath(".//img/@src")[0]
if "http" not in pic:
pic = li.xpath(".//img/@data-originsource")
pic = pic[0] if pic else ""
# print(co, pic)
# 获取描述信息
desc = li.xpath(".//a[@class='ad-title']/text()")[0]
# print(co, desc)
# 获取地址信息
address = li.xpath(".//div/div[@class='ad-item-detail'][1]/text()")[0]
# print(address)
# 类型
dog_type = li.xpath(".//div/div[@class='ad-item-detail'][2]/text()")[0].strip()
dog_type = dog_type.replace(" ", "")
# print(dog_type)
# 获取价格
price = li.xpath(".//div/span/text()")[0]
# print(price)
dog_dict = {
"pic": pic,
"desc": desc,
"address": address,
"dog_type": dog_type,
"price": price,
}
print(self.count, dog_dict)
self.count += 1
# 获取指定url对应的xml对象
def get_html(self, url):
response = requests.get(url)
html = response.text
# print(html)
return etree.HTML(html)
if __name__ == '__main__':
bai = Bai()
bai()
9. Scraping Fang.com (房天下) rental data
import requests
from lxml import etree
import re
class Fang:
def __init__(self):
self.count = 1
def __call__(self, *args, **kwargs):
self.get_max_page()
def get_max_page(self):
base_url = "https://zu.fang.com/house/i3100/"
html, html_xml = self.get_html(base_url)
max_page = int(re.search('共(\d+)页', html).group(1))
# print(max_page)
        # 通过url获取指定页面的数据
self.get_data(max_page)
def get_data(self, max_page):
for page in range(1, max_page+1):
print("=================第{}页开始下载======================".format(page))
page_url = "https://zu.fang.com/house/i3{}/".format(page)
            # 获取分页url的页面
html, html_xml = self.get_html(page_url)
# 缩小范围
dl_list = html_xml.xpath("//div[@class='houseList']/dl[dt]")
# print(len(dl_list))
for co, dl in enumerate(dl_list, 1):
# 获取图片
pic = "https:" + dl.xpath(".//img/@data-src")[0]
pic = pic.replace('275x207', "1000x1000")
# print(co, pic)
# 标题
title = dl.xpath(".//a[@title]/@title")[0]
# print(co, title)
# 租房类型
rent_type = dl.xpath(".//dd/p[2]/text()[1]")[0].strip()
# print(rent_type)
# 室
fang_info = dl.xpath(".//dd/p[2]/text()[2]")[0]
# print(fang_info)
if "室" in fang_info:
room = re.findall('(\d+)室', fang_info)[0]
else:
room = ""
if "厅" in fang_info:
ting = re.findall("(\d+)厅",fang_info)[0]
else:
ting = ""
# print(co, room, ting)
# 面积
area = dl.xpath(".//dd/p[2]/text()[3]")[0]
area = area[:-2]
# print(area)
# 朝向
toward = dl.xpath(".//dd/p[2]/text()[4]")[0].strip()
# print(toward)
# 城区
city_area = dl.xpath(".//dd/p[3]/a[1]/span/text()")[0]
# print(city_area)
# 商圈
business_circle = dl.xpath(".//dd/p[3]/a[2]/span/text()")[0]
# print(business_circle)
# 小区
community = dl.xpath(".//dd/p[3]/a[3]/span/text()")
community = community[0] if community else ""
# print(community)
# 地址
address_list = dl.xpath(".//span[@class='note subInfor']//text()")
# print(address)
# 用空字符串 将列表中的元素 连接成一个字符串
address = "".join(address_list)
# print(address)
# 标签
sign_list = dl.xpath(".//dd/p[@class='mt12']/span/text()")
# print(sign_list)
# 价格
price = dl.xpath(".//span[@class='price']/text()")[0]
# print(price)
fang_dict = {
"pic": pic,
"title": title,
"rent_type": rent_type,
"room": room,
"ting": ting,
"area": area,
"toward": toward,
"city_area": city_area,
"business_circle": business_circle,
"community": community,
"address": address,
"sign_list": sign_list,
"price": price,
}
print(self.count, fang_dict)
self.count += 1
# break
# 获取指定url对应的xml对象
def get_html(self, url):
response = requests.get(url)
html = response.text
# print(html)
# with open("2.html", "r", encoding="utf-8") as f:
# html = f.read()
return html, etree.HTML(html)
if __name__ == '__main__':
fang = Fang()
fang()
10. Scraping Douban (豆瓣) book search data
import requests
from fake_useragent import UserAgent
from selenium import webdriver
from lxml import etree
import time
import re
class Dou:
def __init__(self):
self.count = 1
def __call__(self, *args, **kwargs):
self.get_data()
# 获取最大页码
def get_data(self):
page = 42
while True:
print(f"=================第{page+1}页开始下载===================")
base_url = "https://book.douban.com/subject_search?search_text=python&cat=1001&start={}".format(page*15)
html, html_xml = self.get_html(base_url)
if "查询错误" in html:
break
# print(base_url)
# 缩小范围 获取每本书的大div
div_list = html_xml.xpath("//div[@class='item-root']")
# print(div_list)
# print(len(div_list))
# 循环获取每本书的详细信息
for co, div in enumerate(div_list, 1):
# 获取图片
pic = div.xpath(".//img/@src")[0]
# print(co, pic)
# 获取书名
name = div.xpath(".//a[@class='title-text']/text()")[0]
# print(co, name)
# 评分
score = div.xpath(".//span[@class='rating_nums']/text()")
score = score[0] if score else ""
# print(score)
# 评价人数
comment_nums_str = div.xpath(".//span[@class='pl']/text()")
comment_nums_str = comment_nums_str[0] if comment_nums_str else ""
comment_nums = re.findall("\d+", comment_nums_str)
comment_nums = int(comment_nums[0]) if comment_nums else 0
# print(comment_nums)
# 获取出版社信息
desc_info = div.xpath(".//div[@class='meta abstract']/text()")
if desc_info:
desc_info = desc_info[0]
desc_info = desc_info.replace(" ", "")
else:
desc_info = ""
# print(desc_info)
book_dict = {
"pic": pic,
"name": name,
"score": score,
"comment_nums": comment_nums,
"desc_info": desc_info,
}
print(self.count, book_dict)
self.count += 1
page += 1
# break
# 获取指定url对应的网页信息
def get_html(self, url):
# 通过requests获取不到想要的页面信息 所以改用selenium获取
# headers = {"User-Agent": UserAgent().random}
# response = requests.get(url, headers=headers)
# html = response.text
# print(html)
# 创建浏览器对象
self.driver = webdriver.PhantomJS(executable_path=r"D:\phantomjs-2.1.1-windows\phantomjs"
r"-2.1.1-windows\bin\phantomjs.exe")
# 使用无界面浏览器 发起请求
self.driver.get(url)
time.sleep(1)
# 获取页面信息
html = self.driver.page_source
# print(html)
# 将页面存入文件中 便于开发
# with open("3.html", "r", encoding="utf-8") as f:
# html = f.read()
# 返回一个xml对象
return html, etree.HTML(html)
def __del__(self):
"""
触发条件: 当所有代码执行完成 执行此函数
"""
# print(self.driver)
# print(type(self.driver))
self.driver.close() # 关闭页面
self.driver.quit() # 关闭浏览器
print("------浏览器已关闭-------")
if __name__ == '__main__':
dou = Dou()
dou()
Taobao floor API (淘宝接口)
import requests
import json
from fake_useragent import UserAgent
'''
Analysis:
Floor 1 request: https://tce.taobao.com/api/mget.htm?callback=jsonp1606&tce_sid=1870316,1871653&tce_vid=2,2&tid=,&tab=,&topic=,&count=,&env=online,online
1870316  floor 1
1870321  floor 2
1870333  floor 3
1870340  floor 4
1870341  floor 5
1870342  floor 6
1870343  floor 7
'''
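A small sketch of how the request URL used in Tao.get_data() below is assembled from the per-floor tce_sid values listed above, pairing each sid with tce_vid=2 as in the captured request:

def build_mget_url(tce_sids):
    # join the floor ids and pair each one with a tce_vid of 2
    sid_part = ",".join(str(sid) for sid in tce_sids)
    vid_part = ",".join("2" for _ in tce_sids)
    return "https://tce.taobao.com/api/mget.htm?tce_sid={}&tce_vid={}".format(sid_part, vid_part)

# build_mget_url([1870316, 1870321, 1870333, 1870340, 1870341, 1870342, 1870343])
# -> the base_url used in Tao.get_data() below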
class Tao:
def __init__(self):
pass
def __call__(self, *args, **kwargs):
self.get_data()
def get_data(self):
base_url = "https://tce.taobao.com/api/mget.htm?tce_sid=1870316,1870321,1870333,1870340,1870341,1870342,1870343&tce_vid=2,2,2,2,2,2,2"
headers = {"User-Agent": UserAgent().random}
# 对接口发起请求
response = requests.get(base_url, headers=headers)
# 获取字符串数据
str_data = response.text.strip()
# print(str_data)
# 获取json数据
json_data = json.loads(str_data)
# print(json_data)
count = 1
# 获取到的是一个字典
data_dict = json_data.get('result')
for i in data_dict.values():
data_list = i.get("result")
for data in data_list:
data["item_pic"] = "https:" + data["item_pic"]
print(count, data)
count += 1
if __name__ == '__main__':
tao = Tao()
tao()
11. Scraping NetEase Cloud Music (网易云音乐) data
import requests
from lxml import etree
from fake_useragent import UserAgent
class Music:
def __init__(self):
self.count = 1
def __call__(self, *args, **kwargs):
self.get_class_url_list()
# 获取分类url列表
def get_class_url_list(self):
# 发起请求 获取指定页面
base_url = "https://music.163.com/discover/artist"
html_xml = self.get_html(base_url, 1)
# 获取分类url
class_url_list = html_xml.xpath("//a[@class='cat-flag']/@href")
class_name_list = html_xml.xpath("//a[@class='cat-flag']/text()")
del class_name_list[0]
del class_url_list[0]
# print(class_url_list)
# print(class_name_list)
# print(len(class_url_list))
# print(len(class_name_list))
for index in range(len(class_url_list)):
# index += 1
print("==============={}开始下载================".format(class_name_list[index]))
# 拼接完整的分类url
class_url = "https://music.163.com" + class_url_list[index]
# print(class_url)
# 通过分类url获取字母的url
self.get_alphabet_url(class_url)
# break
def get_alphabet_url(self, class_url):
# 获取分类url的页面 xml对象
html_xml = self.get_html(class_url, 1)
# 获取字母url列表
alphabet_url_list = html_xml.xpath("//ul[@class='n-ltlst f-cb']/li[position()>1]/a/@href")
# print(alphabet_url_list)
# 循环获取每个字母url对应歌手信息
for alphabet_url in alphabet_url_list:
# 拼接完整的字母url
alphabet_url = "https://music.163.com" + alphabet_url
self.get_singer_info(alphabet_url)
# break
def get_singer_info(self, alphabet_url):
# 根据字母url获取每个歌手的名称和对应的详情url
html_xml = self.get_html(alphabet_url, 1)
singer_name_list = html_xml.xpath("//a[@class='nm nm-icn f-thide s-fc0']/text()")
singer_url_list = html_xml.xpath("//a[@class='nm nm-icn f-thide s-fc0']/@href")
# print(singer_name_list)
# print(singer_url_list)
# print(len(singer_name_list))
# print(len(singer_url_list))
for index in range(len(singer_name_list)):
# 声明一个存放歌手信息的字典
singer_url = "https://music.163.com" + singer_url_list[index].strip()
# import json
# singer_dict = json.dumps(singer_dict)
# with open("singer.txt", "w", encoding="utf-8") as f:
# f.write(singer_dict + "\n")
html_xml = self.get_html(singer_url, 0)
# tbody在页面当中显示 但是在代码获取到的页面中一般不显示
hot_song = html_xml.xpath("//ul[@class='f-hide']/li/a/text()")
# print(hot_song)
singer_dict = {
"singer_name": singer_name_list[index],
"singer_url": singer_url,
"hot_song": hot_song
}
print(self.count, singer_dict)
self.count += 1
# break
# 获取指定url对应的页面信息
def get_html(self, url, sign):
        '''
        :param url: the url to fetch
        :param sign: decides which headers to use: 1 means the headers above (random User-Agent), otherwise the headers below (full cookies)
        :return: the lxml element built from the page html
        '''
# headers = {"User-Agent": UserAgent().random}
# if sign == 0:
headers = {
"cookie": "[email protected]:-1:1; mail_psc_fingerprint=7fb6c5032f50ce8c1a07fdb15fd2251d; _iuqxldmzr_=32; _ntes_nnid=ec024cec32803d4dfd5c42e4e40cba08,1552969997617; _ntes_nuid=ec024cec32803d4dfd5c42e4e40cba08; WM_TID=eZJB4FRfmstFBVFRVFZ508IkS9OSa6K6; usertrack=CrHtiVyQhXO2rmpiAwOpAg==; UM_distinctid=16a307022e2b3-0b705b12e3ccd3-414f0c2a-100200-16a307022e3361; NTES_CMT_USER_INFO=72051947%7Cm13349949963_1%40163.com%7Chttp%3A%2F%2Fcms-bucket.nosdn.127.net%2F2018%2F08%2F13%2F078ea9f65d954410b62a52ac773875a1.jpeg%7Cfalse%7CbTEzMzQ5OTQ5OTYzXzFAMTYzLmNvbQ%3D%3D; vinfo_n_f_l_n3=dd7e8b71253298e9.1.0.1555590818606.0.1555590912731; [email protected]|1558093033|0|mail163|00&99|gud&1557298197&urs#bej&null#10#0#0|133963&1||[email protected]; WM_NI=ROVoQSBgJquFTl4wFtlT0uStCW6f1tfWf3lX6czDHARSzgJQQaXu0QDk3vv%2BGl8GXFZhvOKF0OdWlzFB5MvSmfqUF%2B2c8YDTYjUbcM1JWQMmcQImmDpluWXxtf50voINRkI%3D; WM_NIKE=9ca17ae2e6ffcda170e2e6eeb4ae3fbbed98abef7d9a9a8bb2d85a939f9aaff763ac9a8c96ae79b5989da6f52af0fea7c3b92a92919a90d45982b98692f84e98b4fc98c580b08c0096d2808189fa87b480a689aad4ef54f6bdb6a5cb4b928db688c95b93bf9896b35b88b5fd97f52185b4f8a8db4e9ab8bab0ca4ef491acb8ef72869efbaef559afbabfb6c521f2bdf8bac7609bb69b83e247f39699b2d067a18f878ef050b4b4bbb8db74b8bafbd1f5658b929e8ccc37e2a3; __remember_me=true; gdxidpyhxdE=YoWfxdQEE%2BgYxhtnKi5zVBa4eaecS1%2F%2BR48h%2FgaKUjHCIj9OPH8QnoJuU4VE%2BYq4zYxRiKjDWw%2BR%2Bey3b9tDY4PDQSfKUjPQkuqfkPZY6oDRPPZouWGNpQMKNdSy8lpSY7W7Syf90lWTaOUXDzSavZz%5Cw4A1LcvEXNtkeBjksCD5L%2F7O%3A1559116416164; _9755xjdesxxd_=32; NETEASE_WDA_UID=1866933109#|#1559115550866; MUSIC_U=065d91e631703dfb7280fe33a565a5643bafb378927678189c0459a4967381afd261a8a054abc7f1c2a0cd2f9ccbfca9b9370d24fa62f9d6c26e43e3ad55584d850eee1fae4e41b77955a739ab43dce1; __csrf=b8c227a578ab1044087e44fe79d5b402; JSESSIONID-WYYY=blMRzR0VnxMzQI3YWDAisc30pDmUBmsJPcTiRP5bRK0eGtlnRzQnG4Ee963zZ9jzGlA1pX1VyCx8kOkqhCRWwDpAw84JQ4RetEJunCyMYUjgW5d5l4gPYKBTMPkBPiDD8pM9JGynKZei2c338XnVcZBC939OsBPXQR5UlDjc5pZf%2FCew%3A1559119405744"
}
response = requests.get(url, headers=headers)
html = response.text
# 只打印 歌手信息的页面
if sign == 0:
# print(html)
pass
return etree.HTML(html)
if __name__ == '__main__':
music = Music()
music()
'''
index {"singer": "ljj", "hot_song": ["", ""]}
{"ljj": ["", ""]}
'''
12. Scraping Lianjia (链家) rental listings
import redis
import requests
from lxml import etree
from fake_useragent import UserAgent
import re
import pymysql
class CityArea:
def __init__(self):
# 初始化redis连接
self.r = self.get_redis()
def __call__(self, *args, **kwargs):
self.get_city_area()
# redis数据库连接
def get_redis(self):
return redis.Redis(host="127.0.0.1", port=6379, db=1)
def get_city_area(self):
# 获取城区信息
base_url = "https://bj.lianjia.com/zufang/"
html_xml = self.get_html(base_url)
city_area_list = html_xml.xpath("//ul[@data-target='area']/li[position()>1]/a/@href | "
"//ul[@data-target='area']/li[position()>1]/a/text()")
print(city_area_list)
print(len(city_area_list))
for city_area in city_area_list:
if "zufang" in city_area:
city_area = "https://bj.lianjia.com" + city_area
print(city_area)
# 将城区信息插入数据库
self.r.rpush("city_area_list", city_area)
# 获取指定url对应xml页面
def get_html(self, url):
headers = {"User-Agent": UserAgent().random}
response = requests.get(url, headers=headers)
html = response.text
# print(html)
return etree.HTML(html)
class BusinessCircle(CityArea):
def __call__(self, *args, **kwargs):
self.get_business_circle()
# 通过城区url获取商圈url
def get_business_circle(self):
count = 1
# 查询城区信息
city_area_list = self.r.lrange("city_area_list", 0, -1)
# print(city_area_list)
for index in range(0, len(city_area_list), 2):
# print(index)
# 分别获取城区url和城区的名称
city_area_url = city_area_list[index].decode("utf-8")
city_area_name = city_area_list[index+1].decode("utf-8")
print(city_area_url, city_area_name)
# 获取城区url xml对象
html_xml = self.get_html(city_area_url)
# 获取商圈信息
business_circle_list = html_xml.xpath("//div[@id='filter']/ul[4]/li[position()>1]/a/@href | "
"//div[@id='filter']/ul[4]/li[position()>1]/a/text()")
print(business_circle_list)
for index in range(len(business_circle_list)):
# 获取商圈列表中的信息
business_circle = business_circle_list[index]
# 将城区和商圈用-连接起来 存入数据库
if index % 2 == 1:
business_circle = city_area_name + "-" + business_circle_list[index]
print(count, business_circle, type(business_circle))
# print(type(business_circle))
count += 1
# 存入数据库
self.r.rpush("business_circle_list", business_circle)
# break
class Lian(CityArea):
def __call__(self, *args, **kwargs):
self.conn_mysql()
self.count_ucid = 1
self.get_page_url()
def get_page_url(self):
# 查询数据库中的商圈信息
business_circle_list = self.r.lrange("business_circle_list", 0, -1)
# print(business_circle_list)
# 循环获取商圈url
for index in range(0, len(business_circle_list), 2):
# 分别获取商圈url和商圈名称
business_circle_url = business_circle_list[index].decode("utf-8")
# 拼接完整的商圈url
business_circle_url = "https://bj.lianjia.com" + business_circle_url
business_circle_name = business_circle_list[index+1].decode("utf-8")
print(f"==================={business_circle_name}开始下载====================")
print(business_circle_url, business_circle_name)
# 获取商圈url指定xml页面
html_xml = self.get_html(business_circle_url)
# 获取最大页码
max_page = html_xml.xpath("//div[@class='content__pg']/@data-totalpage")
# 如果获取不到最大页码 则max_page 为空列表 然后跳过本次循环
if not max_page:
continue
max_page = int(max_page[0])
# print(max_page, type(max_page))
# 循环生成分页url
for page in range(1, max_page+1):
# 拼接完整的分页url
page_url = business_circle_url + "pg{}/".format(page)
# print(page_url)
# 获取数据
self.get_data(page_url)
break
break
# 获取指定分页url的数据
def get_data(self, page_url):
# 获取分页url页面
html_xml = self.get_html(page_url)
# 缩小范围
div_list = html_xml.xpath("//div[@class='content__list']/div")
for div in div_list:
# 图片
pic = div.xpath(".//img/@data-src")[0]
pic = pic.replace("250x182", "2500x1800")
# print(pic)
# 标题
title = div.xpath(".//p[@class='content__list--item--title twoline']/a/text()")[0].strip()
# print(title)
# 城区
city_area = div.xpath(".//p[@class='content__list--item--des']/a[1]/text()")[0]
# 商圈
business_circle = div.xpath(".//p[@class='content__list--item--des']/a[2]/text()")[0]
# print(city_area, business_circle)
# 面积
area = div.xpath(".//p[@class='content__list--item--des']//text()[4]")
area = area[0].strip() if area else "" # 空值处理
# print(area)
# 朝向
toward = div.xpath(".//p[@class='content__list--item--des']//text()[5]")[0].strip()
# print(toward)
# 房间信息
fang_info = div.xpath(".//p[@class='content__list--item--des']//text()[6]")[0].strip()
# print(fang_info)
room = re.findall("(\d+)室", fang_info) # 室
hall = re.findall("(\d+)厅",fang_info) # 厅
toilet = re.findall("(\d+)卫", fang_info) # 卫
# 空值处理
room = int(room[0]) if room else 0
hall = int(hall[0]) if hall else 0
toilet = int(toilet[0]) if toilet else 0
# print(room, hall, toilet)
# 发布时间
publish_date = div.xpath(".//p[@class='content__list--item--time oneline']/text()")[0]
# print(publish_date)
# 标签
sign_list = div.xpath(".//p[@class='content__list--item--bottom oneline']/i/text()")
# print(sign_list)
# 将标签转换为字符串
sign = "#".join(sign_list)
# print(sign)
# 价格
price = div.xpath(".//em/text()")[0]
# print(price)
# 详情url
detail_url = div.xpath(".//p[@class='content__list--item--title twoline']/a/@href")[0]
# 拼接完整的详情url
detail_url = "https://bj.lianjia.com" + detail_url
# print(detail_url)
fang_dict = {
"pic": pic, "title": title, "city_area": city_area, "business_circle": business_circle,
"area": area, "toward": toward, "room": room, "hall": hall, "toilet": toilet,
"publish_date": publish_date, "sign": sign, "price": price, "detail_url": detail_url
}
self.parse_detail(fang_dict)
# 解析详情页
def parse_detail(self, fang_dict):
# print(fang_dict)
detail_url = fang_dict['detail_url']
print(detail_url)
# 获取详情url对应的xml对象
html_xml = self.get_html(detail_url)
floor = html_xml.xpath("//ul/li[@class='fl oneline'][8]/text()")
floor = floor[0] if floor else ""
# print(floor)
# 获取经纪人电话号码 不在页面中
# 电话号码在接口中
# phone = html_xml.xpath(".//p[@class='content__aside__list--bottom oneline phone']/text()")
# print(phone)
# 获取经纪人id号 ucid
ucid = self.get_ucid(html_xml)
# print(ucid)
# 获取house_code
house_code = re.findall("zufang/(.*?).html", detail_url)[0]
# print(house_code)
# 拼接完整的经纪人接口
agent_url = f"https://bj.lianjia.com/zufang/aj/house/brokers?" \
f"house_codes={house_code}&position=bottom" \
f"&ucid={ucid}"
# print(agent_url)
try:
# 获取接口中的信息
headers = {"User-Agent": UserAgent().random}
json_data = requests.get(agent_url, headers=headers).json()
# print(json_data)
phone = json_data.get("data")[house_code][house_code].get("tp_number")
# print(phone)
except Exception as e:
print(e)
phone = ''
# 将电话和楼层信息放到fang_dict中
fang_dict["floor"] = floor
fang_dict["phone"] = phone
self.insert_mysql(fang_dict)
def insert_mysql(self, fang_dict):
print(self.conn)
print(self.cur)
def conn_mysql(self):
# 创建数据库的连接对象
self.conn = pymysql.connect(host="127.0.0.1", user="root",
database="0218", charset="utf8")
# 创建操作数据库的对象
self.cur = self.conn.cursor()
def get_ucid(self, html_xml):
try:
ucid = html_xml.xpath("//span[@class='contact__im im__online']/@data-info")[0]
# print(ucid)
self.count_ucid = 1
return ucid
except Exception as e:
print(e)
if self.count_ucid == 3:
return ""
else:
self.count_ucid += 1
return self.get_ucid(html_xml)
# ucid = self.get_ucid() = self.get_ucid(html_xml) = ucid
if __name__ == '__main__':
# cityarea = CityArea()
# cityarea()
# 实例化BusinessCircle bc为当前类的对象 调用时触发__call__
# bc = BusinessCircle()
# bc()
lian = Lian()
lian()
'''
Phone-number API analysis:
https://bj.lianjia.com/zufang/aj/house/brokers?house_codes=BJ2259333770690183168&position=bottom&ucid=1000000026012783
https://bj.lianjia.com/zufang/aj/house/brokers?house_codes=BJ2234691835526389760&position=bottom&ucid=1000000023002201
'''