Crawling JD.com with a Python Spider (Small Scale)

1. Development Environment
This article was written against Windows 10 + Python 3.7.
Third-party dependencies: selenium (drives the browser), pymysql (database access), and bs4 (HTML parsing), plus Chrome and ChromeDriver.

2. Prerequisites
The plan is to run a search on JD and save the results.
So the next step is to find the URL behind JD's search; whatever is visible in the browser can be crawled.

base_url = "https://search.jd.com/Search?"
params = {
    "keyword": "android",
    "enc": "utf-8",
    "qrst": 1,
    "page": offset * 2 + 1,  # JD's page parameter runs 1, 3, 5, ... (two internal pages per visual page)
    "wq": "android"
}
browser.get(base_url + urlencode(params))

The snippet above is not complete; it only demonstrates how to build the URL for a simulated browser search (offset is the zero-based page index).
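As a quick sanity check, here is a minimal, self-contained sketch (with offset fixed to 0, i.e. the first results page) of the URL this produces:

from urllib.parse import urlencode

base_url = "https://search.jd.com/Search?"
params = {
    "keyword": "android",
    "enc": "utf-8",
    "qrst": 1,
    "page": 1,  # offset = 0 -> first results page
    "wq": "android"
}

# Prints: https://search.jd.com/Search?keyword=android&enc=utf-8&qrst=1&page=1&wq=android
print(base_url + urlencode(params))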

3. Implementation Steps
At this point the overall plan is:
1) Find the page to crawl
2) Inspect the page's DOM and locate the information we need
3) Parse it
4) Save it to the database

Step 1: create the table (the spiders database itself is assumed to already exist).

def init_database():
    create_table_sql = "CREATE TABLE IF NOT EXISTS jd_android (id INT NOT NULL, name VARCHAR(255) NOT NULL)"
    db = None  # keeps the finally block safe if connect() itself fails
    try:
        db = pymysql.connect(host="localhost", user="root", password="root", port=3306, db="spiders")
        cursor = db.cursor()
        cursor.execute(create_table_sql)
    finally:
        if db:
            db.close()

Step 2: simulate the browser operations.

def spider_by_chrome():
    browser = webdriver.Chrome()
    db = pymysql.connect(host="localhost", user="root", password="root", port=3306, db="spiders")
    for index in range(10):  # crawl the first 10 result pages
        get_source_page(browser, index, db)
    db.close()
    browser.quit()  # quit() shuts down the driver session, not just the current window

Note that the database access here gets no special treatment (no singleton/shared connection, for instance). At small scale this is fine, but as the data volume grows it may become a read/write bottleneck; that is left as a future optimization.
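As a rough idea of what that optimization might look like, here is a minimal sketch of a shared-connection helper (the name get_db and the ping-based reconnect are my own additions, not part of the original code):

import pymysql

_db = None  # module-level cache so every caller shares one connection

def get_db():
    """Return a shared pymysql connection, creating or reviving it on demand."""
    global _db
    if _db is None:
        _db = pymysql.connect(host="localhost", user="root", password="root",
                              port=3306, db="spiders")
    else:
        _db.ping(reconnect=True)  # transparently reconnect if the link has dropped
    return _db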

Step 3: parse the page and save the results to the database.

def get_source_page(browser, offset, db):
    '''
    :param browser: the browser we use to spider
    :param offset: the page to search
    :param db: database to store data
    :return: the final html page
    '''
    params = {
        "keyword": "android",
        "enc": "utf-8",
        "qrst": 1,
        "page": offset * 2 + 1,  # JD's page parameter runs 1, 3, 5, ...
        "wq": "android"
    }
    browser.get(base_url + urlencode(params))
    # Alternative: type into the search box instead of building the URL:
    # input = browser.find_element_by_id("key")
    # input.send_keys("android")
    # input.send_keys(Keys.ENTER)
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, "J_searchWrap")))
    html = BeautifulSoup(browser.page_source, "lxml")
    global book_id
    cursor = db.cursor()
    for item in html.find_all("li", class_="gl-item"):
        book_name = item.select("em")[1]  # the second <em> held the product title when this was written
        print(book_name.text)
        cursor.execute("INSERT INTO jd_android (id, name) VALUES (%s, %s)", (book_id, book_name.text))
        book_id = book_id + 1
    db.commit()  # persist the inserts

Again, the database handling is quite crude and leaves plenty of room for improvement; there is no error handling or rollback, for example.
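For illustration, a minimal sketch of what guarding a single insert with a rollback could look like (this wrapper is my own addition, not part of the original code):

try:
    cursor = db.cursor()
    cursor.execute("INSERT INTO jd_android (id, name) VALUES (%s, %s)", (book_id, book_name.text))
    db.commit()
except pymysql.MySQLError as e:
    db.rollback()  # undo the partial write so the table stays consistent
    print("insert failed:", e)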

That, in summary, is the basic crawling workflow.

4. Complete Code

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from urllib.parse import urlencode
import pymysql
from bs4 import BeautifulSoup

base_url = "https://search.jd.com/Search?"
book_id = 1

def get_source_page(browser, offset, db):
    '''
    :param browser: the browser we use to spider
    :param offset: the page to search
    :param db: database to store data
    :return: the final html page
    '''
    params = {
        "keyword": "android",
        "enc": "utf-8",
        "qrst": 1,
        "page": offset * 2 + 1,  # JD's page parameter runs 1, 3, 5, ...
        "wq": "android"
    }
    browser.get(base_url + urlencode(params))
    # Alternative: type into the search box instead of building the URL:
    # input = browser.find_element_by_id("key")
    # input.send_keys("android")
    # input.send_keys(Keys.ENTER)
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, "J_searchWrap")))
    html = BeautifulSoup(browser.page_source, "lxml")
    global book_id
    cursor = db.cursor()
    for item in html.find_all("li", class_="gl-item"):
        book_name = item.select("em")[1]  # the second <em> held the product title when this was written
        print(book_name.text)
        cursor.execute("INSERT INTO jd_android (id, name) VALUES (%s, %s)", (book_id, book_name.text))
        book_id = book_id + 1
    db.commit()  # persist the inserts


def spider_by_chrome():
    browser = webdriver.Chrome()
    db = pymysql.connect(host="localhost", user="root", password="root", port=3306, db="spiders")
    for index in range(10):  # crawl the first 10 result pages
        get_source_page(browser, index, db)
    db.close()
    browser.quit()  # quit() shuts down the driver session, not just the current window



def init_database():
    create_table_sql = "CREATE TABLE IF NOT EXISTS jd_android (id INT NOT NULL, name VARCHAR(255) NOT NULL)"
    db = None  # keeps the finally block safe if connect() itself fails
    try:
        db = pymysql.connect(host="localhost", user="root", password="root", port=3306, db="spiders")
        cursor = db.cursor()
        cursor.execute(create_table_sql)
    finally:
        if db:
            db.close()


if __name__ == "__main__":
    init_database()
    spider_by_chrome()
