1. Development Environment
The environment for this article is Windows 10 + Python 3.7.
Third-party libraries used: selenium (drives the browser) + pymysql (database access) + bs4 (HTML parsing), together with Chrome and ChromeDriver.
2. Prerequisites
The goal is to fetch JD.com's search results and save them.
The next step, then, is to find the URL behind JD's search: anything you can see in the browser, you can crawl.
base_url = "https://search.jd.com/Search?"
params = {
    "keyword": "android",
    "enc": "utf-8",
    "qrst": 1,
    "page": offset * 2 + 1,
    "wq": "android"
}
browser.get(base_url + urlencode(params))
The snippet above is not complete; it only illustrates how to construct the URL for a simulated browser search.
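As a quick sanity check, here is a minimal self-contained sketch of what that construction produces for the first result page (the page value assumes JD's convention of odd page numbers, i.e. offset * 2 + 1):

from urllib.parse import urlencode

base_url = "https://search.jd.com/Search?"
offset = 0  # first result page
params = {
    "keyword": "android",
    "enc": "utf-8",
    "qrst": 1,
    "page": offset * 2 + 1,  # offset 0 -> page=1, offset 1 -> page=3, ...
    "wq": "android"
}
print(base_url + urlencode(params))
# https://search.jd.com/Search?keyword=android&enc=utf-8&qrst=1&page=1&wq=android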
3. Step-by-Step Implementation
At this point the overall approach is:
1) Find the page to crawl
2) Inspect the page's DOM and locate the information we need
3) Parse it
4) Save it to the database
Step 1: create the database table.
def init_database():
    create_table_sql = ("CREATE TABLE IF NOT EXISTS jd_android "
                        "(id INT NOT NULL PRIMARY KEY, name VARCHAR(255) NOT NULL)")
    # Connect outside the try block so `db` is guaranteed to exist in `finally`
    db = pymysql.connect(host="localhost", user="root", password="root",
                         port=3306, db="spiders")
    try:
        cursor = db.cursor()
        cursor.execute(create_table_sql)
    finally:
        db.close()
Step 2: drive the browser.
def spider_by_chrome():
    browser = webdriver.Chrome()
    db = pymysql.connect(host="localhost", user="root", password="root",
                         port=3306, db="spiders")
    for index in range(10):  # crawl the first 10 result pages
        get_source_page(browser, index, db)
    db.close()
    browser.quit()  # quit() also shuts down the ChromeDriver process
Note that the database handling here is deliberately bare (no singleton or connection pooling, for example). That is fine at this small scale, but it may become a read/write bottleneck as the data volume grows; this is left as a future optimization. A low-effort first step is sketched below.
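As a hedged sketch of that first step (open_db is a hypothetical helper, not part of the code above): wrap the connection's lifetime in a context manager so it is always released, and swap in a real pool such as DBUtils' PooledDB if throughput ever demands it.

from contextlib import contextmanager

import pymysql

@contextmanager
def open_db():
    # One connection per unit of work, always closed on exit.
    db = pymysql.connect(host="localhost", user="root", password="root",
                         port=3306, db="spiders")
    try:
        yield db
    finally:
        db.close()

# usage:
# with open_db() as db:
#     get_source_page(browser, 0, db)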
Step 3: parse the page and save the results to the database.
def get_source_page(browser, offset, db):
    '''
    :param browser: the browser we use to spider
    :param offset: zero-based index of the result page to fetch
    :param db: database connection used to store the data
    '''
    global book_id
    params = {
        "keyword": "android",
        "enc": "utf-8",
        "qrst": 1,
        "page": offset * 2 + 1,  # JD numbers result pages with odd values
        "wq": "android"
    }
    browser.get(base_url + urlencode(params))
    # Alternative: type into the search box instead of building the URL:
    # input = browser.find_element_by_id("key")
    # input.send_keys("android")
    # input.send_keys(Keys.ENTER)
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, "J_searchWrap")))
    html = BeautifulSoup(browser.page_source, "lxml")
    cursor = db.cursor()
    for item in html.find_all("li", class_="gl-item"):
        book_name = item.select("em")[1]  # the second <em> holds the title
        print(book_name.text)
        cursor.execute("INSERT INTO jd_android (id, name) VALUES (%s, %s)",
                       (book_id, book_name.text))
        book_id = book_id + 1
    db.commit()  # persist this page's inserts
Again, the database handling is very crude and leaves a lot of room for improvement: there is no error handling and no rollback. A minimal version of that handling would look like the sketch below.
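A minimal sketch, reusing the names from the function above: wrap the page's inserts in try/except so a database error rolls back the partial page instead of leaving it half-written.

try:
    cursor = db.cursor()
    for item in html.find_all("li", class_="gl-item"):
        book_name = item.select("em")[1]
        cursor.execute("INSERT INTO jd_android (id, name) VALUES (%s, %s)",
                       (book_id, book_name.text))
        book_id = book_id + 1
    db.commit()  # this page's inserts succeed or fail together
except pymysql.MySQLError:
    db.rollback()  # undo the partial inserts for this page
    raise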
That covers the basic crawling workflow.
4. Complete Code
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from urllib.parse import urlencode
import pymysql
from bs4 import BeautifulSoup
base_url = "https://search.jd.com/Search?"
book_id = 1  # running primary key for inserted rows
def get_source_page(browser, offset, db):
    '''
    :param browser: the browser we use to spider
    :param offset: zero-based index of the result page to fetch
    :param db: database connection used to store the data
    '''
    global book_id
    params = {
        "keyword": "android",
        "enc": "utf-8",
        "qrst": 1,
        "page": offset * 2 + 1,  # JD numbers result pages with odd values
        "wq": "android"
    }
    browser.get(base_url + urlencode(params))
    # Alternative: type into the search box instead of building the URL:
    # input = browser.find_element_by_id("key")
    # input.send_keys("android")
    # input.send_keys(Keys.ENTER)
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, "J_searchWrap")))
    html = BeautifulSoup(browser.page_source, "lxml")
    cursor = db.cursor()
    for item in html.find_all("li", class_="gl-item"):
        book_name = item.select("em")[1]  # the second <em> holds the title
        print(book_name.text)
        cursor.execute("INSERT INTO jd_android (id, name) VALUES (%s, %s)",
                       (book_id, book_name.text))
        book_id = book_id + 1
    db.commit()  # persist this page's inserts
def spider_by_chrome():
    browser = webdriver.Chrome()
    db = pymysql.connect(host="localhost", user="root", password="root",
                         port=3306, db="spiders")
    for index in range(10):  # crawl the first 10 result pages
        get_source_page(browser, index, db)
    db.close()
    browser.quit()  # quit() also shuts down the ChromeDriver process
def init_database():
    create_table_sql = ("CREATE TABLE IF NOT EXISTS jd_android "
                        "(id INT NOT NULL PRIMARY KEY, name VARCHAR(255) NOT NULL)")
    # Connect outside the try block so `db` is guaranteed to exist in `finally`
    db = pymysql.connect(host="localhost", user="root", password="root",
                         port=3306, db="spiders")
    try:
        cursor = db.cursor()
        cursor.execute(create_table_sql)
    finally:
        db.close()
if __name__ == "__main__":
    init_database()
    spider_by_chrome()
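To sanity-check a run, here is a quick hedged snippet (assuming the same local credentials as above) that counts the stored rows:

import pymysql

db = pymysql.connect(host="localhost", user="root", password="root",
                     port=3306, db="spiders")
cursor = db.cursor()
cursor.execute("SELECT COUNT(*) FROM jd_android")
print(cursor.fetchone()[0], "rows stored")
db.close()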