电商商品爬虫,亚马逊amazon采集源码

亚马逊是国际知名的电商平台。访问国际站需要梯子,国内直接访问到的是 cn 国内站点。不同地区对应不同的站点,每个商品都有一个 id 号,而且不同地区站点上的商品是存在差异的!

电商商品爬虫,亚马逊amazon采集源码_第1张图片

亚马逊(amazon)商品数据采集有点类似于采集百度搜索结果信息,请求头(headers)非常重要:除了 User-Agent(ua)之外,还必须携带 Cookie,否则无法访问,国内站点和国外站点都一样!

感觉网站存在反爬,尤其是英文站点,因为网页源码价格信息与实际前端页面看到的价格信息存在差异!

输入商品id号采集商品相关信息!

国内站(cn)采集

电商商品爬虫,亚马逊amazon采集源码_第2张图片

采集效果

电商商品爬虫,亚马逊amazon采集源码_第3张图片

附源码

#国内亚马逊商品爬虫
#20200213 by微信:huguo00289




# -*- coding=utf-8 -*-
import requests
from fake_useragent import UserAgent
import re,os,time
from lxml import etree


def ua():
    """Build the HTTP request headers used for the product-page request.

    Returns:
        dict: A ``User-Agent`` value taken from ``UserAgent().random`` and a
        fixed, hard-coded ``Cookie`` string.
    """
    # Hard-coded cookie string; it is sent unchanged with every request.
    cookie = 'x-wl-uid=1eZRN4GNhENdZSGdOrvzQEy2WvlxT/sXztd0uB1drNz9lanSFUVkDtpyWsVQQfwSjhXmvZLrY67w=; session-id=459-1321777-5720413; ubid-acbcn=459-5647010-5360714; lc-acbcn=zh_CN; i18n-prefs=CNY; session-token=g6hxLDDoHhzZLHWxd7FnNbtphW7mG7zCPY29lJB7vwUfa73azlZ8jPh8iS6M+c/4mKa3c/d/Pzgiv61e7sJx858blgOf+pmyxOtu55z5AlVE2nRoPAyWFMeG4OKmZQI3Lg5/MNhcN71PW9x2OkQWWLOeqcikSKmxqaEQL9qGyYcnTbrYggdlInP0pROsR8oz; session-id-time=2082787201l; csm-hit=tb:s-KV6TYQQV77AQ5HHBPD94|1581595664859&t:1581595666568&adb:adblk_yes'
    agent = UserAgent().random
    return {'User-Agent': agent, 'Cookie': cookie}


#保存txt
def tx(id,text,path):
    """Save the collected product text to ``{path}{id}.txt``.

    The file is always written as UTF-8 so that Chinese titles and currency
    symbols are stored correctly regardless of the platform's default
    encoding.

    Args:
        id: Product id; used as the file name stem.
        text: Text content to write.
        path: Directory prefix that is concatenated directly in front of the
            file name, so it must already end with a path separator.
    """
    print("正在保存商品数据..")
    with open(f'{path}{id}.txt', 'w', encoding='utf-8') as f:
        f.write(text)
    print(">>>保存商品数据成功!")


#下载图片
def down(img_url,img_name,path):
    """Download one image and save it as ``{path}{img_name}``.

    The file is written only when the server answers with a success status,
    so an HTTP error page is never stored on disk as if it were an image.
    Network errors raised by ``requests.get`` (for example a timeout)
    propagate to the caller.

    Args:
        img_url: URL of the image to fetch.
        img_name: File name to store the image under.
        path: Directory prefix that is concatenated directly in front of
            ``img_name``, so it must already end with a path separator.
    """
    headers = {'User-Agent': UserAgent().random}
    print(f"下载{img_name}图片..")
    r = requests.get(img_url, headers=headers, timeout=10)
    if r.ok:
        with open(f'{path}{img_name}', 'wb') as f:
            f.write(r.content)
        print(f">>>下载{img_name}图片完成!")
    else:
        print(f">>>下载{img_name}图片失败! HTTP状态码:{r.status_code}")
    # Pause between downloads whether or not this one succeeded.
    time.sleep(2)


def get_shopping(id):
    """Fetch an amazon.cn product page and save its title, price and images.

    A directory named after the product title is created in the current
    working directory. It receives ``{id}.txt`` (URL, title and price, one
    per line) and every product image found on the page.

    Args:
        id: Product id appended to ``https://www.amazon.cn/dp/``.

    Raises:
        IndexError: If no product title can be extracted from the page.
    """
    url = f"https://www.amazon.cn/dp/{id}"
    # Replace undecodable bytes instead of aborting on a single bad byte.
    html = requests.get(url, headers=ua(), timeout=10).content.decode('utf-8', errors='replace')
    time.sleep(3)

    titles = re.findall(r'"立即购买:(.+?)",', html, re.S)
    if not titles:
        raise IndexError(f"product title not found in page: {url}")
    title = titles[0]
    print(title)

    # The title becomes a directory name: replace path separators and
    # characters that are not allowed in file names, and drop leading or
    # trailing dots/spaces. Fall back to the product id if nothing is left.
    safe_title = re.sub(r'[\\/:*?"<>|\r\n\t]+', '_', title).strip(' .')
    path = f'{safe_title or id}/'
    os.makedirs(path, exist_ok=True)

    req = etree.HTML(html)
    # A page without this price element should not abort the whole run.
    prices = req.xpath('//span[@id="priceblock_ourprice"]/text()')
    price = prices[0] if prices else 'N/A'
    print(price)

    text = '\n'.join((url, title, price))
    tx(id, text, path)

    for img in req.xpath('//span[@class="a-button-text"]/img/@src'):
        if 'jpg' not in img:
            continue
        # Cut the URL at the first '._' and re-append '.jpg'.
        img_url = f"{img.split('._')[0]}.jpg"
        img_name = img_url.split('/')[-1]
        print(img_url, img_name)
        down(img_url, img_name, path)




if __name__=='__main__':
    # Strip whitespace that often comes along with a pasted id; it would
    # otherwise end up inside the request URL.
    product_id = input("请输入要采集的商品id(比如:B00C3YADSK):").strip()
    if not product_id:
        raise SystemExit("商品id不能为空!")
    get_shopping(product_id)

美国站(com)采集(需要梯子!!)

比较坑爹的梯子

我的展示页面(国内免费梯子)

电商商品爬虫,亚马逊amazon采集源码_第4张图片

tx的付费梯子

电商商品爬虫,亚马逊amazon采集源码_第5张图片

什么鬼?显示无法配送到我的地址?而且没有价格显示?

不管了,价格pass

电商商品爬虫,亚马逊amazon采集源码_第6张图片

采集效果

电商商品爬虫,亚马逊amazon采集源码_第7张图片

附上源码参考:

#国外亚马逊商品爬虫
#20200213
#https://www.amazon.com/dp/B07S3659V2


# -*- coding=utf-8 -*-
import requests
from fake_useragent import UserAgent
import re,os,time,random
from lxml import etree


def ua():
    """Build request headers with a random User-Agent and a partly random Cookie.

    One random integer in the range 1111111..9999999 is used as the last
    segment of both the ``session-id`` and the ``ubid-acbcn`` cookie values.
    The rest of the cookie string is fixed.

    Returns:
        dict: Headers with the keys ``User-Agent`` and ``Cookie``.
    """
    agent_source = UserAgent()
    # The same random suffix is shared by both cookie fields.
    suffix = str(random.randint(1111111, 9999999))
    cookie = f'x-wl-uid=1eZRN4GNhENdZSGdOrvzQEy2WvlxT/sXztd0uB1drNz9lanSFUVkDtpyWsVQQfwSjhXmvZLrY67w=; session-id=459-1321777-{suffix}; ubid-acbcn=459-5647010-{suffix}; lc-acbcn=zh_CN; i18n-prefs=CNY; session-token=g6hxLDDoHhzZLHWxd7FnNbtphW7mG7zCPY29lJB7vwUfa73azlZ8jPh8iS6M+c/4mKa3c/d/Pzgiv61e7sJx858blgOf+pmyxOtu55z5AlVE2nRoPAyWFMeG4OKmZQI3Lg5/MNhcN71PW9x2OkQWWLOeqcikSKmxqaEQL9qGyYcnTbrYggdlInP0pROsR8oz; session-id-time=2082787201l; csm-hit=tb:s-KV6TYQQV77AQ5HHBPD94|1581595664859&t:1581595666568&adb:adblk_yes'
    return {
        'User-Agent': agent_source.random,
        'Cookie': cookie,
    }


#保存txt
def tx(id,text,path):
    """Write ``text`` to the UTF-8 file ``{path}{id}.txt``.

    Progress messages are printed before and after the write.

    Args:
        id: Product id; used as the file name stem.
        text: Text content to write.
        path: Directory prefix placed directly in front of the file name.
    """
    print("正在保存商品数据..")
    target = f'{path}{id}.txt'
    with open(target, mode='w', encoding='utf-8') as out_file:
        out_file.write(text)
    print(">>>保存商品数据成功!")


#下载图片
def down(img_url,img_name,path):
    """Download one image and save it as ``{path}{img_name}``.

    The file is written only when the server answers with a success status,
    so an HTTP error page is never stored on disk as if it were an image.
    Network errors raised by ``requests.get`` (for example a timeout)
    propagate to the caller.

    Args:
        img_url: URL of the image to fetch.
        img_name: File name to store the image under.
        path: Directory prefix that is concatenated directly in front of
            ``img_name``, so it must already end with a path separator.
    """
    headers = {'User-Agent': UserAgent().random}
    print(f"下载{img_name}图片..")
    r = requests.get(img_url, headers=headers, timeout=10)
    if r.ok:
        with open(f'{path}{img_name}', 'wb') as f:
            f.write(r.content)
        print(f">>>下载{img_name}图片完成!")
    else:
        print(f">>>下载{img_name}图片失败! HTTP状态码:{r.status_code}")
    # Pause between downloads whether or not this one succeeded.
    time.sleep(1)


def get_shopping(id):
    """Fetch an amazon.com product page and save its details and images.

    A directory named after the product id is created in the current working
    directory. It receives ``{id}.txt`` (URL, title, price and product
    description) and every product image found on the page.

    Args:
        id: Product id appended to ``https://www.amazon.com/dp/``.

    Raises:
        IndexError: If no product title can be extracted from the page.
    """
    url = f"https://www.amazon.com/dp/{id}"
    # Replace undecodable bytes instead of aborting on a single bad byte.
    html = requests.get(url, headers=ua(), timeout=10).content.decode('utf-8', errors='replace')
    time.sleep(2)
    req = etree.HTML(html)

    # The capture group must be bounded on the right: a lazy group at the very
    # end of a pattern matches only a single character.
    # NOTE(review): assumes the name appears as "<title>Amazon.com: NAME</title>"
    # -- confirm against a live page.
    titles = re.findall(r'<title[^>]*>\s*Amazon\.com\s*:\s*(.+?)\s*</title>', html, re.S)
    if not titles:
        raise IndexError(f"product title not found in page: {url}")
    title = titles[0]
    print(title)

    path = f'{id}/'
    os.makedirs(path, exist_ok=True)

    # A page with no embedded price should not abort the whole run.
    prices = re.findall(r'"isPreorder":.+?,"price":(.+?),"doesMAPPolicyApply":.+?', html, re.S)
    price = f'${prices[0]}' if prices else 'N/A'
    print(price)

    productdescription = '\n'.join(req.xpath('//div[@id="productDescription"]//text()'))
    text = '\n'.join((url, title, price, productdescription))
    tx(id, text, path)

    for img in req.xpath('//span[@class="a-button-text"]/img/@src'):
        if 'jpg' not in img:
            continue
        # Cut the URL at the first '._' and re-append '.jpg'.
        img_url = f"{img.split('._')[0]}.jpg"
        img_name = img_url.split('/')[-1]
        print(img_url, img_name)
        down(img_url, img_name, path)

    print(">>>下载图片完毕!")


if __name__=='__main__':
    # Example id: B07XR5TRSZ
    # Strip whitespace that often comes along with a pasted id; it would
    # otherwise end up inside the request URL.
    product_id = input("请输入要采集的商品id(比如:B07GJ2MWTZ):").strip()
    if not product_id:
        raise SystemExit("商品id不能为空!")
    get_shopping(product_id)


你可能感兴趣的:(电商商品爬虫,亚马逊amazon采集源码)