这是我以前写的一个实验性质的京东爬虫脚本,基于 selenium 库和 Chrome 浏览器,可以根据不同的商品名称,抓取京东商城上的商品明细列表,并存入 MySQL 数据库。
京东爬虫的Github项目地址
ChromeDriver下载镜像链接
python3安装教程
Docker安装MySQL5.7和8
-- Schema for scraped JD.com product listings (one row per search-result item).
SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;

DROP TABLE IF EXISTS `jdshop`;
CREATE TABLE `jdshop` (
  `sid` int(11) NOT NULL AUTO_INCREMENT COMMENT '商品ID',
  -- Text columns now inherit the table's utf8mb4 charset/collation: the old
  -- per-column utf8 (= utf8mb3) declarations could not store 4-byte
  -- characters (e.g. emoji), which do occur in JD titles and ad copy.
  `keyword` varchar(255) NULL DEFAULT NULL COMMENT '搜索词',
  `shop` varchar(100) NULL DEFAULT NULL COMMENT '店铺名称',
  `label` varchar(50) NULL DEFAULT NULL COMMENT '前置标签',
  `title` varchar(100) NULL DEFAULT NULL COMMENT '商品标题',
  `advertising` varchar(1000) NULL DEFAULT NULL COMMENT '广告',
  `price` decimal(8, 2) NULL DEFAULT NULL COMMENT '价格',
  `pinggou` varchar(40) NULL DEFAULT NULL COMMENT '拼购价',
  `plus` decimal(8, 2) NULL DEFAULT NULL COMMENT '会员价',
  `comment` varchar(30) NULL DEFAULT NULL COMMENT '评论数',
  `tag` varchar(100) NULL DEFAULT NULL COMMENT '打标',
  `isale` int(11) NULL DEFAULT NULL COMMENT '销量排名',
  `ctime` datetime(0) NULL DEFAULT NULL COMMENT '抓取时间',
  -- The former UNIQUE INDEX `sid_UNIQUE` duplicated the primary key: it added
  -- write overhead while the PRIMARY KEY already guarantees uniqueness.
  PRIMARY KEY (`sid`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 6001 CHARACTER SET = utf8mb4 COLLATE = utf8mb4_0900_ai_ci ROW_FORMAT = Dynamic;

SET FOREIGN_KEY_CHECKS = 1;
# Product keyword to crawl.
# NOTE(review): this fragment duplicates KEYWORD / conn defined again further
# down in the script — presumably a leftover snippet from the article; the
# later definitions are the ones the program actually uses.
KEYWORD = '手机'
# MySQL connection.
# NOTE(review): credentials are hard-coded; move to env vars/config for real use.
conn = pymysql.connect(host='192.168.72.128', port=3306, user='admin', passwd='XXXXXX', db='dahlindb')
# Standard library
import csv
import html
import json
import re
import time
from urllib.parse import quote

# Third-party
import pymysql
from bs4 import BeautifulSoup
from pyquery import PyQuery as pg
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# --- Module-level configuration and side-effectful setup ---
INDEX = 0  # global sales-rank counter, incremented once per scraped item
CREATE_TIME = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  # batch timestamp stored in ctime
KEYWORD = '手机'  # product keyword to search for
# JD search URL template; page/s (amount) are filled per index in index_page().
URL = 'https://search.jd.com/Search?keyword={key1}&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq={key2}&page={page}&s={amount}&psort=3&cid2=653&cid3=655&click=0'
MAX_PAGE = 100  # number of result pages to crawl
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')  # run Chrome without a visible window
browser = webdriver.Chrome(options=chrome_options)
wait = WebDriverWait(browser, 10)  # explicit-wait helper, 10 second timeout
# NOTE(review): credentials are hard-coded; move to env vars/config for real use.
conn = pymysql.connect(host='192.168.72.128', port=3306, user='admin', passwd='XXXXX', db='dahlindb')
# Parameterized INSERT — values are bound via %s placeholders by pymysql,
# so scraped text is never interpolated into the SQL string.
sql_insert = "INSERT INTO jdshop (keyword,shop,label,title,advertising,price,pinggou,plus,comment,tag,isale,ctime) " \
"VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);"
cursor = conn.cursor()
def filter_title(thtml):
    """
    Extract the leading promo label and the plain-text title from the
    HTML fragment of a search-result item's `.p-name` node.

    NOTE(review): the published regexes lost their HTML tags when the
    article was rendered — e.g. ``re.sub('', ' ', temp)`` inserts a space
    between every character and ``re.search('(.*?)', ...)`` always matches
    the empty string. This rebuilds the evident intent: label = text of
    the first <span ...> tag (if any), title = the fragment with all
    markup stripped, entities unescaped, and whitespace collapsed.

    :param thtml: HTML fragment (str), or None/'' when the node is empty
    :return: (label, title) tuple of plain strings ('' when absent)
    """
    label, title = '', ''
    if thtml:
        first_span = re.search(r'<span[^>]*>(.*?)</span>', thtml, re.S)
        if first_span:
            label = first_span.group(1).strip()
        # Drop every tag, then decode entities such as &nbsp; / &amp;.
        text = html.unescape(re.sub(r'<[^>]+>', ' ', thtml))
        # Collapse runs of whitespace (incl. NBSP) into single spaces.
        title = ' '.join(text.split())
        # The label span is part of the node text; remove it from the title.
        if label and title.startswith(label):
            title = title[len(label):].strip()
    return label, title
def index_page(index):
    """
    Load search-result page `index` in the headless browser and hand the
    rendered HTML to get_products().

    JD paginates oddly: for index > 1 the `page` query parameter is
    index + 2 and `s` (the item offset) advances by 60 per page.
    Timeouts waiting for '.gl-item' are printed and swallowed so the
    crawl continues with the next page.

    :param index: 1-based logical page number
    :return: None
    """
    try:
        if index > 1:
            amount = (index - 1) * 60 + 1
            page = index + 2
        else:
            amount, page = 1, 1
        print("正在扒取第{page}页".format(page=index))
        target = URL.format(key1=quote(KEYWORD), key2=quote(KEYWORD),
                            page=page, amount=amount)
        browser.get(target)
        # Scroll to the bottom a few times so lazily-loaded items render.
        for _ in range(4):
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)
        wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.gl-item'))
        )
        get_products(browser.page_source)
    except TimeoutException as e:
        print(e)
def get_products(html):
    """
    Parse every '.gl-item' product card out of a rendered search page and
    insert one row per product into the `jdshop` table.

    Uses module globals: INDEX (running sales-rank counter), KEYWORD,
    CREATE_TIME, cursor/conn (MySQL) and sql_insert.

    :param html: full page source from the browser
    :return: None
    """
    global INDEX
    doc = pg(html)
    for item in doc('.gl-item').items():
        INDEX = INDEX + 1
        pinggou = item.find('.price-pingou').text().replace('\n', ' ').strip('¥')
        thtml = item.find('.p-name').html()
        label, title = filter_title(thtml)
        # An image-only '.p-tag3' badge marks curated items.
        if item.find('.p-tag3').attr('src'):
            label = '京东精选'
        shop = item.find('.p-shop').text().strip('\n')
        advertising = item.find('.promo-words').text()
        plus = item.find('.price-plus-1').text().strip('¥')
        # Bug fix: str.strip('二手有售') strips any of those *characters* from
        # both ends, not the substring — use replace() to drop the exact text.
        comment = item.find('.p-commit').text().replace('\n条评价', '').replace('二手有售', '').strip('\n')
        tag = item.find('.p-icons').text().replace('\n', '-')
        # Bug fix: the <i> price node can be missing, making `.i` None and
        # crashing on `.string`; treat that as an unparseable price.
        price_tag = BeautifulSoup(item.find('.p-price').html(), 'lxml').i
        price = price_tag.string if price_tag is not None else None
        # Fall back to the regular price when there is no PLUS member price.
        plus = plus if plus != '' else price
        if price is None or not price.split('.')[0].isdigit():
            price = "0"
            plus = "0"
        row = (KEYWORD, shop, label, title, advertising, price, pinggou,
               plus, comment, tag, INDEX, CREATE_TIME)
        print(row)
        try:
            # Single row: execute() is the right call (was executemany with a
            # one-element list, and its return value was never used).
            cursor.execute(sql_insert, row)
            conn.commit()
        except Exception as e:
            # Best-effort insert: log and keep scraping the remaining items.
            print(e)
        print('The {} is running,return value is {}! '.format(INDEX, cursor.lastrowid))
def main():
    """
    Crawl MAX_PAGE search-result pages and persist every product row,
    then release the browser and database resources.

    :return: None
    """
    try:
        for i in range(1, MAX_PAGE + 1):
            index_page(i)
            time.sleep(2)  # throttle requests between pages
    finally:
        # Bug fix: close() only closes the window and leaks the chromedriver
        # process; quit() shuts the driver down. The finally block guarantees
        # cleanup even when a page raises mid-crawl.
        browser.quit()
        cursor.close()
        conn.close()

if __name__ == '__main__':
    main()