Python爬虫 爬取电商平台商品历史价格、最低价格(慢慢买网)

主要使用的库:

requests:爬虫请求并获取源码
re:使用正则表达式提取数据
json:使用JSON提取数据
pandas:使用pandas存储数据
#!coding=utf-8
import requests
import os
import re
import json
import datetime
import time
import pandas as pd
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
import win32api,win32con

def raw(text):
    """Escape the URL delimiter characters in *text* for the request body.

    '/', '?', '=' and ':' are replaced by their double-percent-encoded
    forms ('%252F', '%253F', '%253D', '%253A'); '&' is replaced by '%26'.
    Every other character is copied through unchanged.
    """
    replacements = {
        '/': '%252F',
        '?': '%253F',
        '=': '%253D',
        ':': '%253A',
        '&': '%26',
    }
    # Characters without an entry map to themselves.
    return ''.join(replacements.get(ch, ch) for ch in text)


def _date_from_utc_fields(year, month, day):
    """Build a ``datetime.date`` from the arguments of a ``Date.UTC(...)`` call.

    JavaScript's ``Date.UTC`` uses zero-based months, so the month is
    advanced by one. A month value of 12 rolls over to January of the
    following year.
    """
    if month == 12:
        return datetime.date(year=year + 1, month=1, day=day)
    return datetime.date(year=year, month=month + 1, day=day)


def _parse_price_trend(jiagequshi):
    """Turn the price-trend string into parallel date and price lists.

    The string is stripped of '[Date.UTC(', ')' and ']' and split on ',',
    which yields consecutive groups of four fields: year, month, day, price.

    For every point after the first, an extra point dated one day earlier
    and carrying the previous point's price is emitted first, so the series
    holds each price until the day before the next change.

    Returns:
        A ``(date_list, price_list)`` tuple; both lists are empty when the
        string contains no complete group.
    """
    fields = (
        jiagequshi.replace('[Date.UTC(', '')
        .replace(')', '')
        .replace(']', '')
        .split(',')
    )

    date_list = []
    price_list = []
    previous_price = None
    # Stopping at len - 3 skips an incomplete trailing group (including the
    # single empty field produced by an empty input string).
    for start in range(0, len(fields) - 3, 4):
        year, month, day = (int(value) for value in fields[start:start + 3])
        price = float(fields[start + 3])
        date = _date_from_utc_fields(year, month, day)

        if previous_price is not None:
            date_list.append(date - datetime.timedelta(days=1))
            price_list.append(previous_price)

        date_list.append(date)
        price_list.append(price)
        previous_price = price

    return date_list, price_list


def mmm(item):
    """Fetch the price history of a product and append it to ``out.csv``.

    The product URL is escaped with ``raw`` and sent to the manmanbuy widget
    service; the JSON reply's ``single`` object supplies the title, picture,
    price trend, lowest price and its date, product link, trend and trend
    remark. One CSV row is written per trend point, with the product-level
    fields repeated on every row.

    Args:
        item: Product page URL to look up.

    Returns:
        None. If the reply is not valid JSON or lacks an expected field, the
        error is printed and nothing is written.
    """
    item = raw(item)
    url = 'https://apapia.manmanbuy.com/ChromeWidgetServices/WidgetServices.ashx'
    headers = {
        'Host': 'apapia.manmanbuy.com',
        'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
        'Proxy-Connection': 'close',
        'Cookie': 'ASP.NET_SessionId=uwhkmhd023ce0yx22jag2e0o; jjkcpnew111=cp46144734_1171363291_2017/11/25',
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_3 like Mac OS X) AppleWebKit/603.3.8 (KHTML, like Gecko) Mobile/14G60 mmbWebBrowse',
        'Content-Length': '457',
        'Accept-Encoding': 'gzip',
        'Connection': 'close',
    }
    postdata = (
        'c_devid=2C5039AF-99D0-4800-BC36-DEB3654D202C&username=&qs=true&c_engver=1.2.35&c_devtoken=&c_devmodel=iPhone%20SE&c_contype=wifi&'
        't=1537348981671&c_win=w_320_h_568&p_url={}&'
        'c_ostype=ios&jsoncallback=%3F&c_ctrl=w_search_trend0_f_content&methodName=getBiJiaInfo_wxsmall&c_devtype=phone&'
        'jgzspic=no&c_operator=%E4%B8%AD%E5%9B%BD%E7%A7%BB%E5%8A%A8&c_appver=2.9.0&bj=false&c_dp=2&c_osver=10.3.3'
    ).format(item)

    # The context manager closes the session's connections when done.
    with requests.Session() as s:
        s.headers.update(headers)
        # NOTE(review): a form body is sent with GET and certificate
        # verification is disabled, both as in the original; confirm whether
        # the endpoint expects POST.
        req = s.get(url=url, data=postdata, verify=False).text

    # Pull every required field up front so a malformed reply is reported
    # once instead of failing later on an undefined name.
    try:
        single = json.loads(req)['single']
        title = single['title']  # product name
        pic = single['smallpic']  # picture
        jiagequshi = single['jiagequshi']  # price trend
        lowerPrice = single['lowerPrice']  # lowest price
        lowerDate = single['lowerDate']  # date of the lowest price
        itemurl = single['url']  # product link
        qushi = single['qushi']  # trend
        changPriceRemark = single['changPriceRemark']  # trend change remark
    except (ValueError, KeyError, TypeError) as e:
        print(e)
        return

    # Keep the first run of at most ten digits and read it as a Unix
    # timestamp in seconds, formatted in local time.
    lowerDate = re.search(r'[1-9]\d{0,9}', lowerDate).group(0)
    lowerDate = time.strftime("%Y-%m-%d", time.localtime(int(lowerDate)))

    date_list, price_list = _parse_price_trend(jiagequshi)

    df = pd.DataFrame({'date_日期': date_list, 'price_价格': price_list})
    df.loc[:, "title_名称"] = title
    df.loc[:, "pic_图片"] = pic
    df.loc[:, "lowerPrice_最低价格"] = lowerPrice
    df.loc[:, "lowerDate_最低价格日期"] = lowerDate
    df.loc[:, "itemurl_商品链接"] = itemurl
    df.loc[:, "qushi_趋势"] = qushi
    df.loc[:, "changPriceRemark_趋势变动"] = changPriceRemark

    # NOTE(review): append mode is kept from the original; a header row is
    # presumably written on every call, so repeated runs interleave headers.
    df.to_csv('out.csv', index=False, mode='a', encoding="GB18030")


if __name__ == '__main__':
    # Product pages from JD, Taobao, Tmall and other e-commerce platforms
    # can all be looked up.
    product_url = 'https://detail.tmall.com/item.htm?id=538801983798'
    mmm(product_url)
  

 

你可能感兴趣的:(python,爬虫)