Crawl the JD.com laptop listings: 920 pages, 60 items per page (do the math on the total yourself).
# _*_ coding: utf-8 _*_
__author__ = 'qyg'
__date__ = '2019/3/28 8:45'
import requests
import json
import urllib3.contrib.pyopenssl
from lxml import etree
import random
import csv
import time
pages = 920
class Craw_JD():
def __init__(self,pages):
urllib3.contrib.pyopenssl.inject_into_urllib3()
self.start_url = 'https://item.jd.com/'
self.pages = pages
self.url = 'https://list.jd.com/list.html?cat=670,671,672&page='  # listing URL template; JD category pages follow a regular pattern, and cat identifies the product category
self.USER_AGENTS = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',  # default request header, helps dodge basic bot checks
'Connection': 'Keep-Alive',
}
def response_handler(self):
for page in range(1, self.pages + 1):  # walk through every page number
url_ = self.url + str(page)  # append the page number
prox = self.pro_x()  # pick a working proxy
self.headers['User-Agent'] = random.choice(self.USER_AGENTS)  # rotate the User-Agent at random
res = requests.get(url=url_, proxies=prox, headers=self.headers, verify=False)  # verify=False disables SSL verification
res = etree.HTML(res.text)  # parse the page for xpath queries
items = res.xpath('//*[@id="plist"]/ul/li/div/div[1]/a/@href')  # hrefs of every detail page on this listing page, e.g. //item.jd.com/8674557.html
responses = [requests.get(url='https:'+str(item), proxies=prox, headers=self.headers, verify=False) for item in items if item]  # list comprehension: fetch every detail page of this listing page
print("fetched this page's 60 detail pages -----")
self.parse(responses, items)  # hand the responses to parse() to extract the fields
def pro_x(self):
'''Proxy picker: use random.choice over the harvested ip list and only return an ip that still works.'''
prox = dict()  # a proxy is also a key-value mapping: 'http': 'ip:port'
ips = []
with open('ip.txt', 'r') as fp:  # read the harvested ips from ip.txt into a list
while True:
line = fp.readline()  # read one line at a time
if not line:
break
ips.append(line.rstrip())  # rstrip() drops the trailing newline before appending to ips
prox['http'] = random.choice(ips)
try:  # proxies=prox routes the request through the proxy; timeout=2 means it has to answer within 2 seconds
res = requests.get(url='https://item.jd.com/', proxies=prox, headers=self.headers, timeout=2)
if res.status_code == 200:
return prox  # the proxy answered with 200, so use it
except:
return self.pro_x()  # the ip failed, recurse and pick another one
def parse(self,responses,items):
data = []
if responses:
for index,res in enumerate(responses):
html=etree.HTML(res.text)
url=items[index]
d_name = html.xpath('//*[@id="crumb-wrap"]/div/div[2]/div[2]/div[1]/div/a/text()')
if d_name:
d_name=d_name[0]
else:
continue
strId = url.split("/")
ids = strId[3].split(".")
sku_id = ids[0]
name=html.xpath('//*[@id="parameter-brand"]/li/@title')  # brand name, //*[@id="parameter-brand"]/li/a
if name:
name = name[0]  # unwrap the single-element list
else:
continue  # skip this item when the brand is missing
XL,GoodRade= self.get_xiaoliang(sku_id)  # sales volume and good-rating rate
price = self.get_price(sku_id)  # price
# comms = self.get_comm(sku_id,page=1)
# if comms:
# comms = '|'.join(comms)
# else:  # fetching review data is unstable; if you still want the reviews, uncomment this block and add a comment field to data
# comms = 'this user left no review, or it was not captured'  # commented out because the free proxy ips are too unreliable
good_info = html.xpath('//*[@id="detail"]/div[2]/div[1]/div[1]/ul[2]/li/text()')
good_info=','.join(good_info)
data.append(['sku_id:'+str(sku_id),'sku_name:'+name,'d_name:'+d_name,'price:'+str(price),'销量:'+str(XL),'GoodRade:'+str(GoodRade),good_info])
self.save_data(data)
else:
print('400')
def get_price(self,sku_id):
'''Much the same as get_xiaoliang(), which fetches the good-rating rate and sales volume; see its description below rather than repeating it here.'''
url = 'https://p.3.cn/prices/mgets?skuIds='+str(sku_id)
while True:  # keep looping until the request succeeds
try:
# the except clauses below catch exceptions raised by requests and wait
# for the network to recover, so the crawler keeps running uninterrupted
prox=self.pro_x()
req = requests.get(url,proxies=prox, headers=self.headers, timeout=20)
break
except requests.exceptions.ConnectionError:
print('ConnectionError -- please wait 3 seconds')
time.sleep(3)
except requests.exceptions.ChunkedEncodingError:
print('ChunkedEncodingError -- please wait 3 seconds')
time.sleep(3)
except:
print('Unfortunately -- an unknown error happened, please wait 3 seconds')
time.sleep(3)
#t = requests.get(url,headers = self.headers,timeout=3,verify=False)
dd = json.loads(req.text)
price = dd[0].get('p')
return price
# def get_comm(self,sku_id,page):
# '''fetch the reviews, one page (10 reviews) per product'''
# comms = []
# proc = self.pro_x()
# commUrl = "https://sclub.jd.com/comment/productPageComments.action?&productId={0}&score=0&sortType=5&pageSize=10&page={1}&isShadowSku=0&fold=1" .format(sku_id,page)
# while True:  # keep looping until the request succeeds
# try:
# # the except clauses below catch exceptions raised by requests and wait
# # for the network to recover, so the crawler keeps running uninterrupted
# req = requests.get(commUrl,proxies=proc, headers=self.headers, timeout=20)
# if req.content:
# dd = json.loads(req.text)
# break
# except requests.exceptions.ConnectionError:
# print('ConnectionError -- please wait 3 seconds')
# time.sleep(3)
# except requests.exceptions.ChunkedEncodingError:
# print('ChunkedEncodingError -- please wait 3 seconds')
# time.sleep(3)
# except:
# print('Unfortunitely -- An Unknow Error Happened, Please wait 3 seconds')
# time.sleep(3)
#
# tags = dd.get("comments")
# for i in tags:
# con = i.get('content')
# comms.append(con)
# return comms
def get_xiaoliang(self,sku_id):
'''Fetch the sales volume and good-rating rate; the JSON API endpoint is requested below.'''
url = 'https://club.jd.com/comment/productCommentSummaries.action?referenceIds=' + str(sku_id)
while True:  # keep looping until the request succeeds
try:
# the except clauses below catch exceptions raised by requests and wait
# for the network to recover, so the crawler keeps running uninterrupted
prox=self.pro_x()
req = requests.get(url,proxies=prox, headers=self.headers, timeout=20)
if req.status_code==200:
break
except requests.exceptions.ConnectionError:
print('ConnectionError -- please wait 3 seconds')
time.sleep(3)
except requests.exceptions.ChunkedEncodingError:
print('ChunkedEncodingError -- please wait 3 seconds')
time.sleep(3)
except:
print('Unfortunately -- an unknown error happened, please wait 3 seconds')
time.sleep(3)
dd = json.loads(req.text)  # the response is JSON, so deserialize it and read the keys
comment = dd.get("CommentsCount")
ss = comment[0]
comment_sum = ss.get('CommentCount')  # total review count, used as the sales figure
goodrate = ss.get("GoodRate")  # good-rating rate
return comment_sum,goodrate
def save_data(self,data):  # newline='' keeps the csv writer from inserting blank lines
with open('jd_data.csv', 'a+', encoding='utf-8',newline='') as fp:
writer = csv.writer(fp)  # write with the csv module
for row in data:
writer.writerow(row)  # one row per call; each pass through the loop stores one page's 60 items
print("60 rows stored")
def main():
craw=Craw_JD(pages)
urllib3.disable_warnings()  # silence the SSL warnings triggered by verify=False
urllib3.contrib.pyopenssl.inject_into_urllib3()  # route SSL handling through pyOpenSSL
# monkey.patch_ssl()
craw.response_handler()
if __name__ == '__main__':
main()
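Both JSON endpoints above sometimes come back empty or truncated when routed through free proxies, and indexing dd[0] or comment[0] then raises IndexError. Two small defensive helpers, sketched on the assumption that a healthy response has exactly the shape the code above reads (a top-level list carrying 'p' for the price, and a 'CommentsCount' list carrying 'CommentCount' and 'GoodRate'); the helper names are made up here:
import json

def extract_price(resp_text):
    '''Return the "p" field of the first entry, or None if the body is not the expected list.'''
    try:
        data = json.loads(resp_text)
    except ValueError:
        return None
    if isinstance(data, list) and data:
        return data[0].get('p')
    return None

def extract_summary(resp_text):
    '''Return (CommentCount, GoodRate), or (None, None) when "CommentsCount" is missing or empty.'''
    try:
        entries = json.loads(resp_text).get('CommentsCount') or []
    except (ValueError, AttributeError):
        entries = []
    if entries:
        return entries[0].get('CommentCount'), entries[0].get('GoodRate')
    return None, None
get_price() and get_xiaoliang() could call these instead of indexing the parsed JSON directly, so a malformed proxy response skips the field instead of crashing the page loop.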
Proxy harvester: scrape free proxies from xicidaili and keep the live ones in ip.txt for the crawler above.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
from lxml import etree
# target URL: the xicidaili free proxy site
url = 'https://www.xicidaili.com/?tdsourcetag=s_pctim_aiomsg'
# request headers
headers = {
'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1'}
# fetch the target site with requests.get and keep the response object
response = requests.get(url=url, headers=headers)
print(response.text)
# build an etree object from response.text, the full xicidaili page
etree_obj = etree.HTML(response.text)
# filter response.text down to the table rows that carry the ip info
ip_list = etree_obj.xpath("//tr[@class='odd']")
item = []
# walk the rows, join each ip with its port, and append the result to the item list
for ip in ip_list:
ip_num = ip.xpath('./td[2]/text()')[0]
port_num = ip.xpath('./td[3]/text()')[0]
http = ip_num + ':' + port_num
item.append(http)
ips = []
with open('ip.txt','r') as fp:
while True:
line = fp.readline()
ips.append(line.rstrip())
if not line:
break
print(ips)  # print the proxies that are already stored
# probe each candidate ip for liveness
for it in item:
# not every ip works, so wrap the request in a try/except
try:
proxy = {
'http': it
}
url1 = 'https://www.baidu.com/'
# probe by hitting Baidu with timeout=1, i.e. drop the connection if nothing comes back within 1 second
res = requests.get(url=url1, proxies=proxy, headers=headers, timeout=1)
if res.status_code == 200:
if it not in ips:  # skip ips that are already stored
with open('ip.txt', 'a+') as fp:
fp.write(it + '\n')  # new ip, append it
else:
print(it, 'already stored')
# log the check; elapsed.total_seconds() is the response time
print(it + '--', res.elapsed.total_seconds())
except BaseException as e:
print(e)
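One caveat about how these proxies are consumed: requests picks the proxy entry whose key matches the URL scheme, so the {'http': 'ip:port'} dict built in pro_x() is ignored for the https:// JD URLs and those requests go out directly. To push https traffic through the proxy as well, store both keys; a minimal sketch (the helper name and the sample address are made up):
def as_proxies(ip_port):
    '''Map one "ip:port" string to a proxies dict that covers both schemes.'''
    return {
        'http': 'http://' + ip_port,
        'https': 'http://' + ip_port,  # a plain HTTP proxy, tunneled via CONNECT for https targets
    }

# e.g. requests.get('https://item.jd.com/', proxies=as_proxies('1.2.3.4:8080'), timeout=2)
Expect a high failure rate either way; many free proxies do not support the CONNECT tunnelling that https targets need, which is part of why the crawler keeps verify=False and short timeouts.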
Hadoop handled the big-data processing and Python cleaned the fields; later on, the product name inside the product-info field turned out to be awkward to handle, so it was simply dropped.
# only the Python regular-expression cleaning step is shown here
# data cleaning: strip the irregular leading key-value pair (商品名称:) from the last column
import re
import csv
a = []
with open('jd_data2.csv','r',encoding='utf-8') as fp:
reader = csv.reader(fp)
for i in reader:
# print(i[6])
str5 = re.sub(r'商品名称.*商品编号', '商品编号', i[6])
i[6] = str5
a.append(i)
print(i)
with open('jd_clear.csv','a',encoding='utf-8',newline="") as f:
writer=csv.writer(f)
for one in a:
writer.writerow(one)
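To see what the substitution does, here is a made-up value for column 7 (the sample string is hypothetical; real rows contain whatever the detail-page xpath scraped):
sample = '商品名称:某品牌笔记本 X1,商品编号:8674557,商品毛重:2.5kg'
print(re.sub(r'商品名称.*商品编号', '商品编号', sample))
# prints: 商品编号:8674557,商品毛重:2.5kg
The greedy .* also swallows anything sitting between 商品名称 and the last occurrence of 商品编号, which is acceptable here because the product name is the first pair in the field.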
# coding=utf8
"""
Migrate pyecharts and Flask with custom template functions.
"""
from __future__ import unicode_literals
from flask import Flask, render_template
from flask.templating import Environment
from pyecharts import HeatMap, Map, Pie, Line, Bar,WordCloud,TreeMap
from pyecharts.engine import ECHAERTS_TEMPLATE_FUNCTIONS
from pyecharts.conf import PyEchartsConfig
from models import Computer_Cpu,Computer_Price,Computer_Brand
from exts import db
# ----- Adapter ---------
class FlaskEchartsEnvironment(Environment):  # jinja2 environment wired up for pyecharts
def __init__(self, *args, **kwargs):
super(FlaskEchartsEnvironment, self).__init__(*args, **kwargs)
self.pyecharts_config = PyEchartsConfig(jshost='/static/js')  # serve the echarts js files locally
self.globals.update(ECHAERTS_TEMPLATE_FUNCTIONS)  # register the pyecharts template functions globally
# ---User Code ----
class MyFlask(Flask):  # subclass Flask
jinja_environment = FlaskEchartsEnvironment  # make FlaskEchartsEnvironment the default template environment
app = MyFlask(__name__)
# configure the database URI
app.config['SQLALCHEMY_DATABASE_URI'] = 'mysql://root:12345@localhost:3306/*****?' \
'charset=utf8'
db.init_app(app)
@app.route("/")
def index():
return render_template('index.html')
@app.route("/brand/")
def computer_brand():
'''laptop sales by brand'''
data_list=Computer_Brand.query.all()
brands=[]
count=[]
for item in data_list:
brands.append(item.brand)
count.append(item.count)
pie = Pie('笔记本-销量/牌子Info',title_pos='center',width=1000)
pie.add('',brands,count,center=[25,50],is_random=True,radius=[30,65],rosetype='area',is_legend_show=False,
is_label_show=True)
# Computer_Count.query.all().order_by
pie.add('',brands,count,center=[80,50],is_random=True,radius=[30,65],rosetype='radius',is_legend_show=False)
return render_template('Com_brand.html', hm=pie)
@app.route("/cpu/")
def Compu_cpu():
'''laptop sales by CPU model'''
cpus = []
count = []
data_list = Computer_Cpu.query.all()
for item in data_list:
# print(item.cpu_version)
cpus.append(item.cpu_version)
count.append(item.count)
bar = Bar("笔记本销量-CPU关系柱状图图")
# bar.use_theme('dark')
bar.add("cpu型号", cpus, count, mark_point=["min", "average","max",])
# return 'hello world'
return render_template('Com_cpu.html',hm=bar)
@app.route("/price/")
def Compu_Price():
'''laptop price distribution'''
prices = []
count = []
data_list = Computer_Price.query.all()
for item in data_list:
prices.append(item.price)
count.append(item.count)
line = Line("笔记本-价格需求分析折线图")
line.add("哈哈", prices, count, mark_point=["average"])
return render_template('Com_price.html', hm=line)
if __name__ == '__main__':
app.run(debug=True)
models.py, the model classes
# _*_ coding: utf-8 _*_
__author__ = 'qyg'
__date__ = '2019/3/29 15:47'
from exts import db
class Computer_Brand(db.Model):
__tablename__ = 'computer_brand'  # table name
'''fields behind the visualisation'''
id = db.Column(db.Integer,primary_key=True)
brand = db.Column(db.String(255))
count = db.Column(db.Integer)
class Computer_Cpu(db.Model):
__tablename__ = 'computer_cpu'
'''fields behind the visualisation'''
id = db.Column(db.Integer,primary_key=True)
cpu_version = db.Column(db.String(255))
count = db.Column(db.BigInteger)
class Computer_Price(db.Model):
__tablename__ = 'computer_price'
'''fields behind the visualisation'''
id = db.Column(db.Integer,primary_key=True)
price = db.Column(db.String(255))
count = db.Column(db.Integer)
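The count columns in these three tables came out of the Hadoop cleaning step; for a quick local check the same brand/count pairs can be produced straight from the crawled CSV, for example with a Counter (a rough sketch that assumes the jd_data.csv column order written by save_data above):
import csv
from collections import Counter

brand_counts = Counter()
with open('jd_data.csv', encoding='utf-8') as fp:
    for row in csv.reader(fp):
        if len(row) >= 2 and row[1].startswith('sku_name:'):
            brand_counts[row[1][len('sku_name:'):]] += 1  # strip the "sku_name:" prefix before counting

for brand, count in brand_counts.most_common():
    print(brand, count)  # one pair per computer_brand row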
exts.py
# _*_ coding: utf-8 _*_
__author__ = 'qyg'
__date__ = '2019/4/2 13:24'
from flask_sqlalchemy import SQLAlchemy
db = SQLAlchemy()
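With exts.py in place, the three tables can be created once from the models before the aggregated data is loaded; a one-off sketch (the module name app for the Flask file above is an assumption):
from app import app  # hypothetical module name for the Flask application above
from exts import db
import models  # imported so the model classes register themselves with db

with app.app_context():
    db.create_all()  # creates computer_brand, computer_cpu and computer_price if they do not exist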