python爬取当当商城信息并保存至数据库

由于最近自己在写个电商项目,需要大量的商品信息,故学习了下怎么爬取现有的商城商品信息。

爬取页面

爬取页面:http://category.dangdang.com/cid4002644.html,利用xpath检索到需要的数据,利用item={}(即dict,相当于map)将每个商品需要的数据保存起来,最终通过json模块将所有商品保存到json/data5.json中。
代码如下:

# -*- coding: utf-8 -*-
import requests
import lxml.html
import json

def parse_url(xiaohua_url, headers, timeout=10):
    """Download a page and return its body decoded as GBK text.

    Args:
        xiaohua_url: Address of the page to fetch.
        headers: Mapping of HTTP request headers passed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body decoded with the ``gbk`` codec.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an
            HTTP error status.
        UnicodeDecodeError: If the body is not valid GBK.
    """
    # Without an explicit timeout the call may block indefinitely.
    response = requests.get(xiaohua_url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error page to the parser.
    response.raise_for_status()
    return response.content.decode("gbk")

def get_data(html_content):
    """Extract title, image URL and price for each product on a listing page.

    Args:
        html_content: HTML text of a category listing page.

    Returns:
        A list of dicts with the keys ``top_title``, ``pict_src`` and
        ``price``, one per product ``<li>`` that provides all three values.
        Entries missing any of them are skipped instead of raising
        ``IndexError``.
    """
    metree = lxml.html.etree
    # Build the element tree that the XPath queries run against.
    parser = metree.HTML(html_content, metree.HTMLParser())
    # Every <li> under the product list is one product.
    div_list = parser.xpath('//div[@id="search_nature_rg"]/ul[@class="bigimg cloth_shoplist"]/li')
    result = []
    for element in div_list:
        titles = element.xpath('./a/@title')
        # The original read @src for the first 8 items and @data-original
        # afterwards, which suggests later thumbnails are lazy-loaded.
        # Checking the attribute itself avoids depending on that count.
        images = element.xpath('./a/img/@data-original') or element.xpath('./a/img/@src')
        prices = element.xpath('./p[@class="price"]/span[@class="price_n"]/text()')
        if not (titles and images and prices):
            # Incomplete entry: skip it rather than abort the whole page.
            continue
        result.append({
            "top_title": titles[0],
            "pict_src": images[0],
            "price": prices[0],
        })
    return result


def save_res_file(res_datas):
    """Write the scraped records to ``./json/data5.json`` as indented JSON.

    The ``./json`` directory is created when it does not exist yet, so a
    first run no longer fails with ``FileNotFoundError``.

    Args:
        res_datas: JSON-serialisable data, here the list of product dicts.
    """
    import os  # local import keeps this block independent of the file header

    output_path = "./json/data5.json"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # ensure_ascii=False keeps the Chinese titles readable in the file.
    json_strs = json.dumps(res_datas, ensure_ascii=False, indent=2)
    with open(output_path, "w", encoding="utf-8") as files:
        files.write(json_strs)
    print("保存成功")


def main():
    """Scrape the first listing page of the category and save it as JSON."""
    xiaohua_url = "http://category.dangdang.com/cid4002644.html"
    # The original dict literal had a second closing brace, a SyntaxError.
    # NOTE(review): this User-Agent looks garbled ("Mozi424", "Safari/414.36");
    # it is kept unchanged here - confirm whether a standard browser string
    # is wanted.
    headers = {
        "User-Agent": "Mozi424/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/524.36 (KHTML, like Gecko) Chrome/72.0.3617.111 Safari/414.36",
    }
    html_data = parse_url(xiaohua_url, headers)
    res_datas = get_data(html_data)
    save_res_file(res_datas)


# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()

生成的json文件如下所示:

[
  {
    "top_title": " [当当自营]EGISOO御姬秀天然蜂蜜手工皂100g 控油保湿 洁面皂洗脸皂",
    "pict_src": "http://img3m5.ddimg.cn/32/11/60616445-1_b_17.jpg",
    "price": "¥10.00"
  },
  {
    "top_title": " 【跨店每满100减50】【买2送1】莱玫睡眠护唇膜滋润淡化唇色纹唇部护理补水去死皮保湿女润唇膏秋冬必备",
    "pict_src": "http://img3m8.ddimg.cn/68/2/1176367568-1_b_7.jpg",
    "price": "¥19.90"
  },
  {
    "top_title": " 【跨店每满100减50】莱玫 水润修护芦荟胶300g 祛痘淡印晒后修护补水保湿舒缓凝胶",
    "pict_src": "http://img3m5.ddimg.cn/50/25/1132628855-1_b_7.jpg",
    "price": "¥29.90"
  },
  {
    "top_title": " [当当自营]宝妈日记 保湿洁面乳100ml-天然温和 洁净补水 洗完不紧绷 痘痘肌敏感肌温和首选 孕妇护肤品 孕妇洗面",
    "pict_src": "http://img3m3.ddimg.cn/84/23/60611943-1_b_25.jpg",
    "price": "¥10.00"
  },
  .......
]

保存至数据库

打开上一步生成的json/data5.json,通过json模块解析文件,然后连接数据库并将数据逐条导入,代码如下:

# -*- coding: utf-8 -*-
import json
import pymysql

def get_data():
    """Load and return the records stored in ``./json/data5.json``."""
    with open('./json/data5.json', 'r', encoding="utf-8") as source:
        raw_text = source.read()
    # The file holds a single JSON document, so it is parsed in one step.
    return json.loads(raw_text)


def _price_to_cents(price_text):
    """Convert a price label such as ``"¥19.90"`` to the amount times 100."""
    # Keep what follows the currency sign; tolerate a missing sign and
    # thousands separators.
    amount = price_text.split('¥')[-1].strip().replace(",", "")
    whole, _, fraction = amount.partition(".")
    # Normalise the fraction to exactly two digits so "10" and "10.5" scale
    # the same way as "10.00" (the original joined the raw parts).
    fraction = (fraction + "00")[:2]
    return int(whole or 0) * 100 + int(fraction)


def data_insert(a):
    """Insert every scraped record into the ``os_item`` table.

    Ids are assigned sequentially starting at 15614544323501. All rows are
    committed together once every insert has succeeded, and the connection
    is closed even when an insert fails.

    Args:
        a: Iterable of dicts with the keys ``top_title``, ``price`` and
            ``pict_src``.
    """
    # Keyword arguments: PyMySQL 1.0+ no longer accepts positional ones.
    # NOTE(review): credentials are hard-coded - consider reading them from
    # configuration.
    db = pymysql.connect(
        host="localhost",
        user="root",
        password="123456",
        database="ocean_shop",
    )
    insert_ca = "insert into os_item(id,title,price,image) VALUES(%s,%s,%s,%s)"
    index = 15614544323500
    try:
        # One cursor for the whole import; the context manager closes it.
        with db.cursor() as cursor:
            for a_text in a:
                index += 1
                price = a_text['price']
                print(price)
                cursor.execute(
                    insert_ca,
                    [index, a_text['top_title'], _price_to_cents(price), a_text['pict_src']],
                )
        db.commit()
    finally:
        db.close()

# Entry point: read the saved JSON records and load them into the database.
if __name__ == "__main__":
    records = get_data()
    data_insert(records)

导入成功!

多页插入

即如何利用xpath爬取多页的信息呢?
其实很简单,通过对网址的对比可以发现
首页地址:
http://category.dangdang.com/cid4002644.html
第二页地址:
http://category.dangdang.com/pg2-cid4002644.html
第三页地址:
http://category.dangdang.com/pg3-cid4002644.html
发现只要很简单地进行url填充就行,具体代码如下:

# -*- coding: utf-8 -*-
import requests
import lxml.html
import json

def parse_url(xiaohua_url, headers, timeout=10):
    """Download a page and return its body decoded as GBK text.

    Args:
        xiaohua_url: Address of the page to fetch.
        headers: Mapping of HTTP request headers passed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body decoded with the ``gbk`` codec.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an
            HTTP error status.
        UnicodeDecodeError: If the body is not valid GBK.
    """
    # Without an explicit timeout the call may block indefinitely.
    response = requests.get(xiaohua_url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error page to the parser.
    response.raise_for_status()
    return response.content.decode("gbk")

def get_data(html_content, all_data):
    """Append the products found on one listing page to ``all_data``.

    Args:
        html_content: HTML text of a category listing page.
        all_data: List that collects product dicts across pages; it is
            extended in place.

    Returns:
        The same ``all_data`` list, extended with one dict (keys
        ``top_title``, ``pict_src``, ``price``) per product ``<li>`` that
        provides all three values. Entries missing any of them are skipped
        instead of raising ``IndexError``.
    """
    metree = lxml.html.etree
    # Build the element tree that the XPath queries run against.
    parser = metree.HTML(html_content, metree.HTMLParser())
    # Every <li> under the product list is one product.
    div_list = parser.xpath('//div[@id="search_nature_rg"]/ul[@class="bigimg cloth_shoplist"]/li')
    for element in div_list:
        titles = element.xpath('./a/@title')
        # The original read @src for the first 8 items and @data-original
        # afterwards, which suggests later thumbnails are lazy-loaded.
        # Checking the attribute itself avoids depending on that count.
        images = element.xpath('./a/img/@data-original') or element.xpath('./a/img/@src')
        prices = element.xpath('./p[@class="price"]/span[@class="price_n"]/text()')
        if not (titles and images and prices):
            # Incomplete entry: skip it rather than abort the whole page.
            continue
        all_data.append({
            "top_title": titles[0],
            "pict_src": images[0],
            "price": prices[0],
        })
    return all_data


def save_res_file(res_datas):
    """Write the scraped records to ``./json/data6.json`` as indented JSON.

    The ``./json`` directory is created when it does not exist yet, so a
    first run no longer fails with ``FileNotFoundError``.

    Args:
        res_datas: JSON-serialisable data, here the list of product dicts.
    """
    import os  # local import keeps this block independent of the file header

    output_path = "./json/data6.json"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # ensure_ascii=False keeps the Chinese titles readable in the file.
    json_strs = json.dumps(res_datas, ensure_ascii=False, indent=2)
    with open(output_path, "w", encoding="utf-8") as files:
        files.write(json_strs)
    print("保存成功")


def main(page_count=9):
    """Scrape consecutive listing pages of the category and save them as JSON.

    Each page is requested exactly once. The original requested both
    ``cid4002644.html`` and ``pg1-cid4002644.html`` on its first iteration;
    assuming ``pg1`` addresses the same first page, that stored its products
    twice.

    Args:
        page_count: Number of pages to fetch, starting at page 1. The
            default of 9 matches the original ``range(1, 10)``.
    """
    # One header set for every request. The Taobao Referer sent with the
    # original first request is unrelated to this site and is omitted.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.413 Safari/537.36",
    }
    all_data = []
    for index in range(1, page_count + 1):
        if index == 1:
            # The first page has no "pgN-" prefix.
            xiaohua_url = "http://category.dangdang.com/cid4002644.html"
        else:
            xiaohua_url = "http://category.dangdang.com/pg" + str(index) + "-cid4002644.html"
        html_data = parse_url(xiaohua_url, headers)
        all_data = get_data(html_data, all_data)
    save_res_file(all_data)


# Run the multi-page scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()

你可能感兴趣的:(python爬虫)