新手学习入门python爬虫实战淘宝

记录一下

  • 重点
  1. 由于淘宝需要登录,我们需要在headers中加入cookie
  2. 获得的json格式,我们可以用在线json解析工具查看结构,这里也包括了两个自己画树状结构的函数
import requests
import bs4
import re
import json
# json.loads() 将 json 字符串解析为 Python 数据结构
# json.dumps() 将 Python 数据结构序列化为 json 字符串
# 打开链接
def open_url(keyword, timeout=10):
    """GET page 1 of Taobao search results for *keyword*.

    Taobao requires a logged-in session, so the request carries a
    hard-coded Cookie header captured from a browser session
    (NOTE(review): this cookie expires; refresh it from the browser
    when the response no longer contains ``g_page_config``).

    Args:
        keyword: search term, sent as the ``q`` query parameter.
        timeout: seconds before the request is aborted (default 10),
            so a stalled connection cannot hang the script forever.

    Returns:
        requests.Response for the search page, sorted by sales
        (``sort=sale-desc``) on the Tmall tab (``tab=mall``).
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0",
               "Cookie": "cna=ecr8FaGj4BkCAXQBA/ISif7h; t=74708015a4c8f195ffecf6910094518a; cookie2=1ba81d62075785c1aba0fef97e44a639; v=0; _tb_token_=ee75efeb6a6f3; _samesite_flag_=true; sgcookie=Ee1UjSL5Xy%2FoGCIy2YTej; unb=2567975303; uc3=nk2=BM4UqWtIwrLmo%2FTS&id2=UU20srK2IG68%2FA%3D%3D&lg2=Vq8l%2BKCLz3%2F65A%3D%3D&vt3=F8dBxGXFryZCZN77wes%3D; csg=ded34206; lgc=g812cm%5Cu7684%5Cu73AE%5Cu54E5; cookie17=UU20srK2IG68%2FA%3D%3D; dnk=g812cm%5Cu7684%5Cu73AE%5Cu54E5; skt=ff3aa2e9fdb99acd; existShop=MTU4OTM3NDg0Ng%3D%3D; uc4=id4=0%40U2%2Fz993QMiMBcqer33PZH8r8U2VT&nk4=0%40BsogzKLUdUAKNt4RccUT3lfRsxtfiYY%3D; tracknick=g812cm%5Cu7684%5Cu73AE%5Cu54E5; _cc_=WqG3DMC9EA%3D%3D; _l_g_=Ug%3D%3D; sg=%E5%93%A535; _nk_=g812cm%5Cu7684%5Cu73AE%5Cu54E5; cookie1=BxY5GoxuA9R6Jz%2FjbN3SW2nHhWhRwOZ7xqnSsqpvp6E%3D; enc=ueaGTkz%2FLBTfJlmU57xXHLpBRvG8gMuUQ1vbsr%2FC7%2BznvJM9wz9CcNW9oZJziPT5aGuke9p6l6uOqtAPluKTkg%3D%3D; tfstk=c8UOBww_q9XiuqNZ4rIncloQSHClZc_qddMXkszqEo5AFo8Ainzuyu4R5jvtJ6C..; hng=CN%7Czh-CN%7CCNY%7C156; thw=cn; mt=ci=112_1; uc1=cookie14=UoTUM2M25mx%2F5g%3D%3D&cookie16=UtASsssmPlP%2Ff1IHDsDaPRu%2BPw%3D%3D&existShop=false&cookie21=UtASsssmeW6lpyd%2BB%2B3t&cookie15=W5iHLLyFOGW7aA%3D%3D&pas=0; JSESSIONID=31174B96F0394FF0592B8156FBA4E94D; l=eBEwguePQlV4qScBBOfwPurza77OSIRAguPzaNbMiT5P9Hfp5khhWZbg1u89C3GVh6D9R3ykIQI_BeYBqIv4n5U62j-la_kmn; isg=BObmTB7XhMrekFBu76YpHChkN1xoxyqB1oftY9CP0onkU4ZtOFd6kcwhq09feyKZ"}
    payload = {'q': keyword, 'sort': 'sale-desc', 'tab': 'mall'}
    url = "https://s.taobao.com/search"
    # timeout prevents an indefinite hang; existing callers are unaffected.
    return requests.get(url, params=payload, headers=headers, timeout=timeout)
# 获取列表页的所有商品
def get_items(res):
    """Extract the product list embedded in a Taobao search result page.

    The page embeds its data as a JavaScript assignment
    ``g_page_config = {...};`` — we pull the JSON out with a regex and
    parse it.

    Args:
        res: response object exposing the page HTML via ``res.text``.

    Returns:
        list[dict]: one dict per product with keys
        ``nid, title, detail_url, view_price, view_sales, nick``
        (``None`` for any field missing from an item).

    Raises:
        ValueError: when ``g_page_config`` is not in the page — the
            usual cause is an expired login cookie.
    """
    match = re.search(r"g_page_config = (.*?);\n", res.text)
    if match is None:
        # Without this guard the old code died with an opaque AttributeError.
        raise ValueError("g_page_config not found in page; "
                         "the login cookie has probably expired")
    page_config = json.loads(match.group(1))
    auctions = page_config['mods']['itemlist']['data']['auctions']
    fields = ('nid', 'title', 'detail_url', 'view_price', 'view_sales', 'nick')
    # .get() keeps None for absent fields instead of raising KeyError.
    return [{field: item.get(field) for field in fields} for item in auctions]
# 统计该页面所有商品的销量
def count_sales(items, keyword='华为'):
    """Sum the sales counts of items whose title contains *keyword*.

    Args:
        items: product dicts as returned by ``get_items`` (needs the
            ``title`` and ``view_sales`` keys).
        keyword: substring to match in the title; defaults to ``'华为'``
            (previously hard-coded) so existing callers are unchanged.

    Returns:
        int: total of the leading numbers in each matching item's
        ``view_sales`` string (e.g. ``'1200人付款'`` counts 1200).
        Items whose ``view_sales`` has no digits are skipped.
    """
    total = 0
    for item in items:
        if keyword in item['title']:
            sales = re.search(r'\d+', item['view_sales'])
            if sales is not None:  # old code crashed on digit-free strings
                total += int(sales.group())
    return total

def svae_to_text(res, filename="taobao.txt"):
    """Save a response body to a UTF-8 text file.

    NOTE(review): the name has a typo ("svae" for "save") but is kept
    for backward compatibility with existing callers.

    Args:
        res: response object exposing the page HTML via ``res.text``.
        filename: output path; defaults to ``taobao.txt`` (previously
            hard-coded) so existing callers are unchanged.
    """
    with open(filename, "w", encoding="utf-8") as file:
        file.write(res.text)
# 根据节点画树状结构
def get_space_end(level):
    """Return the tree prefix for a leaf node: *level* two-space indents plus '-'."""
    indent = '  ' * level
    return indent + '-'
def get_space_expand(level):
    """Return the tree prefix for an expandable node: *level* two-space indents plus '+'."""
    indent = '  ' * level
    return indent + '+'
# 递归调用,获取所有的json键
def find_keys(targets, level):
    """Recursively print the key tree of a nested dict (JSON structure).

    Each key is printed indented by *level*: non-dict values as leaves
    prefixed with '-', dict values as expandable nodes prefixed with '+'
    whose own keys are printed one level deeper.

    Args:
        targets: dict whose key structure should be printed.
        level: current indentation depth (callers start at 1).
    """
    for key in targets:
        value = targets[key]
        # isinstance also accepts dict subclasses, unlike the old
        # `type(...) is dict` check; the pointless iter() wrapper is gone.
        if isinstance(value, dict):
            print(get_space_expand(level) + key)
            find_keys(value, level + 1)
        else:
            print(get_space_end(level) + key)

# 从下载下来的整个页面中,找到需要的部分另存为一个文件,并递归调用find_keys函数画出树状结构
def read_g_page_config_from_text():
    """Extract the g_page_config JSON from the previously saved page.

    Reads ``taobao.txt`` (written by ``svae_to_text``), pulls out the
    ``g_page_config = {...};`` assignment, saves the raw JSON to
    ``g_page_config.txt`` for inspection, and prints its key structure
    as a tree via ``find_keys``.

    Raises:
        ValueError: when the page text contains no ``g_page_config``
            (typically an expired login cookie).
    """
    with open("taobao.txt", "r", encoding="utf-8") as src:
        match = re.search(r"g_page_config = (.*?);\n", src.read())
    print(match)  # debug: shows the match object (None means not found)
    if match is None:
        # The old code crashed here with an opaque AttributeError.
        raise ValueError("g_page_config not found in taobao.txt")
    with open("g_page_config.txt", "w", encoding="utf-8") as dst:
        dst.write(match.group(1))
    find_keys(json.loads(match.group(1)), 1)
# 这里实现的功能是在笔记本电脑的第一个页面中,看华为电脑的销售量
# 如果需要所有页面,需要改链接中的page,并加循环读取
def main():
    """Fetch page 1 of Taobao results for laptops and print Huawei sales.

    To scan every result page, vary the ``s`` offset in the request and
    accumulate ``count_sales`` over the loop.
    """
    keyword = "笔记本电脑"
    # keyword = input("请输入搜索关键词:")
    response = open_url(keyword)
    # svae_to_text(response)
    # read_g_page_config_from_text()
    items = get_items(response)
    print(items)
    total = 0
    total += count_sales(items)
    print(total)

# Run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    main()

你可能感兴趣的:(数据挖掘)