# 提示:以下是本篇文章正文内容,下面案例可供参考
# 京东商品页爬虫 (JD product-page scraper)
# JD product-page scraper: fetch the page HTML, save a local copy for
# inspection, then try to extract review paragraphs with XPath.
import os

import requests
from lxml import etree

url = 'https://item.jd.com/100009077475.html'
headers = {
    # Without a user-agent JD returns a short HTML stub whose JS
    # redirects to the login page, so a browser UA is required.
    'user-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
}
resp = requests.get(url, headers=headers)
html = resp.text
# print(html)

# Save the raw HTML next to this script for offline inspection.
file_path = os.path.join(os.path.dirname(__file__), '1京东商品页响应临时.html')
with open(file_path, mode='w', encoding='utf-8') as f:
    f.write(html)

# NOTE(review): on the live site reviews are loaded by a separate AJAX
# call, so this XPath may match nothing in the static page — confirm.
dom = etree.HTML(html)
comment_pattern = '//div[@class="comment-item"]//p[@class="comment-con"]/text()'
comments = dom.xpath(comment_pattern)
for i in comments:
    print(i)
# --- Pseudocode notes (not runnable Python; kept as comments so the ---
# --- file parses) -----------------------------------------------------
# Rough sketch of the server-side views the scraped pages correspond to:
#
#   def product_detail(pid):
#       comments = SELECT * WHERE id = pid
#       return httpResponse(...)   # render the product template
#
#   def product_detail():
#       return httpResponse(...)
#
#   def get_comment_by_pid(pid):   # POST endpoint, e.g. pid=101010997
#       comments = SELECT comments FROM comment WHERE pid = pid
#       return comments
# ----------------------------------------------------------------------
import json

# Demo: converting between JSON strings and built-in Python structures.
xiaoming_json_str = """
{
"name" : "小明",
"age" : 13,
"parent" : [
{
"name" : "小明爸爸",
"age" : 43
},
{
"name" : "小明妈妈",
"age" : 43
}
]
}
"""
# Console output alone does not show whether a value is a string or a
# dict — use type() to check.
# print(type(xiaoming_json_str), xiaoming_json_str)

# JSON string -> built-in Python data structure.
xiaoming_dict = json.loads(xiaoming_json_str)
print(type(xiaoming_dict), xiaoming_dict)
print(xiaoming_dict['name'])
for parent in xiaoming_dict['parent']:
    print(parent['name'])

# Built-in Python data structure -> JSON string.
students = [
    {'name': '小明', 'age': 13, 'gender': '男'},
    {'name': '小2', 'age': 13, 'gender': '男'},
    {'name': '小1', 'age': 13, 'gender': '男'},
]
xiaoming_json_str = json.dumps(students)
print(type(xiaoming_json_str), xiaoming_json_str)
# XML sample for comparison with the JSON above (not Python code):
# <students>
#     <stu>
#         <name>小明</name>
#         <age>13</age>
#     </stu>
#     <stu>
#         <name>小红</name>
#         <age>11</age>
#     </stu>
# </students>
import json

import requests

# Fetch a city weather forecast from a free JSON API and print one
# line per forecast day.
url = 'http://t.weather.itboy.net/api/weather/city/101100201'
resp = requests.get(url)
weather_str = resp.text
weather_obj = json.loads(weather_str)
# print(type(weather_obj), weather_obj)
# The JSON is deeply nested; to find the right keys, paste the raw
# response into a temporary .json file, reformat it in the editor
# (Code | Reformat), then follow the levels visually.
weather_data = weather_obj['data']
day_weather_list = weather_data['forecast']
for day in day_weather_list:
    # print(day)
    date = day['date']
    high = day['high']
    low = day['low']
    weather_type = day['type']  # renamed: `type` shadowed the builtin
    print(f'今天2020.1.{date},天气{weather_type},最{high},最{low}')
import json

import requests

# Scrape the first 10 pages of JD product reviews via the public
# comment API.
base_url = 'https://club.jd.com/comment/productPageComments.action'
headers = {
    # Without a user-agent JD returns a short HTML stub whose JS
    # redirects to the login page.
    'user-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
    'Referer': 'https://item.jd.com/'
}
# Tip: copy the query params from the DevTools Network tab, then clean
# them up with the editor's column-selection mode (Alt+Shift+drag).
params = {
    # 'callback': 'fetchJSON_comment98',  # JSONP callback, deliberately omitted
    'productId': 100009077475,  # product id
    'score': 0,
    'sortType': 5,
    'page': 1,  # page number (n)
    'pageSize': 10,
    'isShadowSku': 0,
    'rid': 0,
    'fold': 0
}
for i in range(1, 11):
    params['page'] = i
    # print(params)
    resp = requests.get(base_url, headers=headers, params=params)
    print(resp)
    # The endpoint normally returns JSONP (cross-domain). Options to get
    # JSON: strip the fixed-length wrapper with string methods, use a
    # jsonp-to-json regex, or — as done here — omit the `callback`
    # parameter so the server returns plain JSON.
    comment_json = resp.text
    # print(comment_json)
    comment_obj = json.loads(comment_json)
    comment_data = comment_obj['comments']
    for comment in comment_data:
        comment_id = comment['id']  # renamed: `id` shadowed the builtin
        content = comment['content']
        creation_time = comment['creationTime']
        product_color = comment['productColor']
        print(f'用户ID:{comment_id}\n购买颜色:{product_color}\n评论:{content}')
        print('*' * 100)
import json

import requests

# Pull 10 pages of the newest videos in one Bilibili category and
# print each video title.
base_url = 'https://api.bilibili.com/x/web-interface/newlist'
headers = {
    # Forging the user-agent is enough for this endpoint (per the
    # original note, a cookie field was needed in earlier testing).
    'user-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
    'Referer': 'https://www.bilibili.com/'  # required by the API (anti-leech)
}
params = {
    'rid': 33,  # category id
    'type': 0,
    'pn': 1,  # page number
    'ps': 20,  # page size
    'jsonp': 'jsonp',
    # 'callback': 'jsonCallback_bili_310141257890750756'
}
for i in range(1, 11):
    params['pn'] = i
    resp = requests.get(base_url, headers=headers, params=params)
    view_json = resp.text
    view_obj = json.loads(view_json)
    print(view_obj)
    view_data = view_obj['data']
    view_archives = view_data['archives']
    for view in view_archives:
        title = view['title']
        print(title)