python爬虫爬取steam,epic,origin平台游戏数据

这是我们课程实训的一个功能模块,用来爬取steam、epic、origin三个平台的游戏价格信息。由于三个网站的页面结构不一样,加载数据的方式也不一样,所以我们需要采用不同的方法分别爬取这三个平台的游戏数据。

用到的工具包

BeautifulSoup包 提取爬取网页标签的属性值(游戏的价格信息等)
selenium的webdriver  利用脚本实现动态加载数据
requests 爬取网页数据用

mysql 数据表

python爬虫爬取steam,epic,origin平台游戏数据_第1张图片

具体实现步骤

1. 提取steam数据

首页url
https://store.steampowered.com/search/?specials=1&page=1

steam网站的游戏数据是分页的,我们可以通过url拼接进行爬取

获取页面html信息

# Fetch the HTML of one Steam search-results page
def getPage(pagenum, timeout=30):
    """Download one page of the Steam "specials" search listing.

    Args:
        pagenum: Page number appended to the search URL (the first page
            is ``1``).
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled connection would block the
            crawl indefinitely.

    Returns:
        The response body as text, decoded as UTF-8.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/51.0.2704.63 Safari/537.36'}
    # Example: https://store.steampowered.com/search/?specials=1&page=1
    url = "https://store.steampowered.com/search/?specials=1&page=" + str(pagenum)
    print(url)
    response = requests.get(url, headers=headers, timeout=timeout)

    # Force UTF-8 so the text is decoded the same way regardless of what
    # encoding requests guesses from the response headers.
    response.encoding = 'utf-8'
    return response.text

将html信息存入txt文件中
这一步既是为了方便测试,也相当于留了一个备份,以防爬取的页面之后发生变动或更新

def saveHtmlCode(html, path):
    """Write a page's HTML to *path* so it can be re-parsed later.

    Args:
        html: Page source as ``bytes`` or ``str``. Text is encoded as
            UTF-8 before writing, so both the text returned by
            ``requests`` and the bytes returned by the webdriver helpers
            can be saved.
        path: Destination file path; an existing file is overwritten.
    """
    if isinstance(html, str):
        html = html.encode('utf-8')
    # The context manager guarantees the handle is flushed and closed,
    # even if the write fails.
    with open(path, "wb") as file:
        file.write(html)

提取页面中的游戏数据
这里面要进行数据清理,去除游戏价格中多余的特殊字符(空格,换行等)

# Extract game records from a Steam search-results page
def getGameInfo1(html, game_list):
    """Parse discounted games out of a Steam search page.

    Every ``<a>`` inside the ``search_resultsRows`` container is treated
    as one game row. Rows that carry the plain (non-"discounted") price
    cell are skipped, as are rows missing any of the expected tags or
    whose price text does not split into at least two parts.

    Args:
        html: HTML source of the search page.
        game_list: List that receives one ``EpicGamePrice.Game`` per
            extracted row; it is mutated in place.

    Returns:
        None. Results are delivered through *game_list*.
    """
    soup = BeautifulSoup(html, 'html.parser')

    results = soup.find(id='search_resultsRows')
    if results is None:
        # The page has no result container, so there is nothing to extract.
        return

    for row in results.find_all('a'):
        # A plain price cell (without the "discounted" class) marks a row
        # that is not on sale; only discounted rows are collected.
        if row.find('div', class_="col search_price responsive_secondrow") is not None:
            continue

        img_tag = row.find('img')
        title_tag = row.find('span', class_='title')
        discount_tag = row.find('div', class_="col search_discount responsive_secondrow")
        price_tag = row.find('div', class_="col search_price discounted responsive_secondrow")
        # Skip rows with an unexpected layout instead of crashing on None.
        if (img_tag is None or not img_tag.has_attr('src') or title_tag is None
                or discount_tag is None or price_tag is None):
            continue

        game_src = img_tag['src']
        game_name = title_tag.get_text()
        game_discount = stripAndreplace(discount_tag.get_text())

        # The cleaned price text is split into [original, final].
        price_parts = split(stripAndreplace(price_tag.get_text()))
        if len(price_parts) < 2:
            continue
        game_original_price = price_parts[0].replace(' ', '')
        game_final_price = price_parts[1].replace(' ', '')

        game_source = 'steam'
        game_now_time = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M')
        # Argument order: name, original_price, final_price, discount,
        # source, src, time.
        game = EpicGamePrice.Game(game_name, game_original_price, game_final_price,
                                  game_discount, game_source, game_src, game_now_time)
        print(game.printProperty())
        game_list.append(game)

将爬取的游戏信息存入csv文件中

# game_list is the list of game records; path is the output file path
def write_to_excel(game_list, path):
    """Write the collected game records to a CSV file.

    The file is written as UTF-8 with a BOM (``utf-8-sig``) and starts
    with a fixed header row.

    Args:
        game_list: Records to write. Each record's ``get_*`` members are
            read as attributes, exactly as before.
            NOTE(review): if these are plain methods on ``Game`` rather
            than properties they would need to be called - confirm
            against the ``Game`` class.
        path: Destination CSV path; an existing file is overwritten.
    """
    header = ["game_name", "game_original_price", "game_final_price", "game_discount",
              "game_source", 'game_src', 'game_now_date']
    # newline='' lets the csv module control line endings; without it
    # every row is followed by a blank line on Windows. The context
    # manager closes the file even if a row fails to serialize.
    with open(path, 'w', encoding='utf-8-sig', newline='') as file:
        csv_writer = csv.writer(file)
        csv_writer.writerow(header)
        for game in game_list:
            csv_writer.writerow([game.get_name, game.get_original_price, game.get_final_price,
                                 game.get_discount, game.get_source, game.get_src,
                                 game.get_now_date])

csv文件展示
我是爬了200页数据
python爬虫爬取steam,epic,origin平台游戏数据_第2张图片

2.提取epic数据

首页url
url = "https://www.epicgames.com/store/zh-CN/browse?sortBy=releaseDate&sortDir=DESC&pageSize=30"

因为epic数据是需要点击下面这个加载更多来显示更多游戏的
python爬虫爬取steam,epic,origin平台游戏数据_第3张图片
所以我们第一步需要写一个脚本来点击这个加载按钮,这就用到了我们的webdriver
1. 编写脚本来点击加载更多按钮实现加载更多数据并获取网页源代码

# Open the browser
def openFireFoxDiver():
    """Load the Epic store browse page in Firefox and return its source.

    The page is opened, the element with id ``browse-pagination`` (the
    "load more" control) is clicked once, and after a fixed wait the
    rendered page source is captured.

    Returns:
        The page source encoded as UTF-8 ``bytes``.
    """
    # Local import: the By locator constants are only needed here.
    from selenium.webdriver.common.by import By

    url = "https://www.epicgames.com/store/zh-CN/browse?sortBy=releaseDate&sortDir=DESC&pageSize=30"
    driver = webdriver.Firefox()
    try:
        driver.get(url)
        time.sleep(10)
        # find_element(By.ID, ...) replaces find_element_by_id, which was
        # removed in Selenium 4.
        driver.find_element(By.ID, 'browse-pagination').click()
        # Wait 15 seconds so the extra games finish loading.
        time.sleep(15)
        return driver.page_source.encode('utf-8')
    finally:
        # Always shut the browser down; otherwise every call leaves a
        # Firefox process running.
        driver.quit()

获取页面游戏信息

# Extract game records from the saved Epic page
def getGameInfo(game_list, path):
    """Parse game cards out of a saved Epic store page.

    The HTML is loaded with ``getHtmlCode(path)``. Each ``<li>`` card
    yields one ``Game``; cards without an ``<img>``, or whose image lacks
    the ``data-image`` or ``alt`` attribute, are skipped. Cards whose
    cleaned price text is not numeric are skipped as well.

    Args:
        game_list: List that receives the extracted ``Game`` objects; it
            is mutated in place.
        path: Path passed to ``getHtmlCode`` to load the page source.

    Returns:
        None. Results are delivered through *game_list*.
    """
    html = getHtmlCode(path)
    html_page_soup = BeautifulSoup(html, 'html.parser')
    cards = html_page_soup.find_all('li', class_='css-1adx3p4-BrowseGrid-styles__card')
    print(len(cards))

    # Every price is multiplied by this factor before being stored.
    # NOTE(review): presumably a currency conversion rate - confirm.
    price_factor = 6.53

    for card in cards:
        game_img = card.find('img')
        # The image tag supplies both the picture URL and the game name;
        # a card without them cannot be turned into a record.
        if (game_img is None or not game_img.has_attr('data-image')
                or not game_img.has_attr('alt')):
            continue
        game_src = game_img['data-image']
        game_name = game_img['alt']

        # Discount label
        discount_tag = card.find('span', attrs={'data-component': 'DiscountAmount'})
        game_discount = 'no discount' if discount_tag is None else discount_tag.get_text()
        print(game_discount)

        # Current price; a missing price tag is recorded as '0'.
        final_tag = card.find('span', attrs={'data-component': 'Price'})
        game_final_price = '0' if final_tag is None else final_tag.get_text()

        # Original price; falls back to the current price when there is
        # no struck-through price.
        original_tag = card.find('s', attrs={'data-component': 'Price'})
        game_original_price = game_final_price if original_tag is None else original_tag.get_text()

        game_final_price = stripAndreplace(game_final_price)
        game_original_price = stripAndreplace(game_original_price)
        try:
            final_value = float(game_final_price)
            original_value = float(game_original_price)
        except ValueError:
            # One unparsable price should not abort the whole extraction.
            print("skip non-numeric price: " + str(game_name))
            continue
        game_final_price = str(round(final_value * price_factor, 2))
        game_original_price = str(round(original_value * price_factor, 2))
        print("op =  " + str(game_original_price) + "  fp = " + str(game_final_price))

        game_source = 'epic'
        game_now_time = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M')
        game = Game(game_name, game_original_price, game_final_price, game_discount,
                    game_source, game_src, game_now_time)

        print(game.printProperty())
        game_list.append(game)

写入csv文件

def write_to_excel(game_list, path):
    """Write the collected game records to a CSV file.

    The file is written as UTF-8 with a BOM (``utf-8-sig``) and starts
    with a fixed header row.

    Args:
        game_list: Records to write. Each record's ``get_*`` members are
            read as attributes, exactly as before.
            NOTE(review): if these are plain methods on ``Game`` rather
            than properties they would need to be called - confirm
            against the ``Game`` class.
        path: Destination CSV path; an existing file is overwritten.
    """
    header = ["game_name", "game_original_price", "game_final_price", "game_discount",
              "game_source", 'game_src', 'game_now_date']
    # newline='' lets the csv module control line endings; without it
    # every row is followed by a blank line on Windows. The context
    # manager closes the file even if a row fails to serialize.
    with open(path, 'w', encoding='utf-8-sig', newline='') as file:
        csv_writer = csv.writer(file)
        csv_writer.writerow(header)
        for game in game_list:
            csv_writer.writerow([game.get_name, game.get_original_price, game.get_final_price,
                                 game.get_discount, game.get_source, game.get_src,
                                 game.get_now_date])

爬取页面展示

def write_to_excel(game_list, path):
    """Write the collected game records to a CSV file.

    The file is written as UTF-8 with a BOM (``utf-8-sig``) and starts
    with a fixed header row.

    Args:
        game_list: Records to write. Each record's ``get_*`` members are
            read as attributes, exactly as before.
            NOTE(review): if these are plain methods on ``Game`` rather
            than properties they would need to be called - confirm
            against the ``Game`` class.
        path: Destination CSV path; an existing file is overwritten.
    """
    header = ["game_name", "game_original_price", "game_final_price", "game_discount",
              "game_source", 'game_src', 'game_now_date']
    # newline='' lets the csv module control line endings; without it
    # every row is followed by a blank line on Windows. The context
    # manager closes the file even if a row fails to serialize.
    with open(path, 'w', encoding='utf-8-sig', newline='') as file:
        csv_writer = csv.writer(file)
        csv_writer.writerow(header)
        for game in game_list:
            csv_writer.writerow([game.get_name, game.get_original_price, game.get_final_price,
                                 game.get_discount, game.get_source, game.get_src,
                                 game.get_now_date])

3. 提取origin平台数据

首页url
url = "https://www.origin.com/hkg/en-us/store/deals/holidaysale"

origin平台的游戏数据是通过下拉滑动框来进行动态加载的,因此我们需要写一个脚本,定时将滑动框移到最底端。这个操作重复20次左右就够了(下面的代码中重复了30次),因为origin平台就这么些游戏
编写脚本实现自动下拉滑动框并获取网页源代码

# Open the browser
def openFireFoxDiver():
    """Load the Origin sale page in Firefox and return its source.

    After the initial load the window is scrolled to the bottom 30 times,
    pausing 5 seconds after each scroll, so that the lazily loaded games
    are present in the captured page source.

    Returns:
        The page source encoded as UTF-8 ``bytes``.
    """
    # Alternative listing URL kept for reference:
    # 'https://www.origin.com/hkg/en-us/store/browse?fq=platform:pc-download'
    url = "https://www.origin.com/hkg/en-us/store/deals/holidaysale"
    driver = webdriver.Firefox()
    try:
        driver.get(url)
        time.sleep(15)
        for i in range(0, 30):
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            time.sleep(5)
            print(i)  # progress indicator: scroll rounds completed so far
        return driver.page_source.encode('utf-8')
    finally:
        # Always shut the browser down; otherwise every call leaves a
        # Firefox process running.
        driver.quit()

获取页面游戏信息

# Extract game records from the saved Origin page
def getGameInfo(game_list, path):
    """Parse game offers out of a saved Origin store page.

    The HTML is loaded with ``EpicGamePrice.getHtmlCode(path)``. Each
    ``origin-store-bundle-offer`` element yields one
    ``EpicGamePrice.Game``; offers without an image ``src`` or a title
    heading are skipped.

    Args:
        game_list: List that receives the extracted ``Game`` objects; it
            is mutated in place.
        path: Path passed to ``getHtmlCode`` to load the page source.

    Returns:
        None. Results are delivered through *game_list*.
    """
    html = EpicGamePrice.getHtmlCode(path)
    html_page_soup = BeautifulSoup(html, 'html.parser')

    for offer in html_page_soup.find_all('origin-store-bundle-offer'):
        img_tag = offer.find('img')
        title_tag = offer.find('h2', attrs={'class': 'otktitle-4 origin-storebundleoffer-title'})
        # Skip offers with an unexpected layout instead of crashing on None.
        if img_tag is None or not img_tag.has_attr('src') or title_tag is None:
            continue
        game_src = img_tag['src']
        game_name = title_tag.get_text()

        # Current price; '-1' stands in when the price tag is missing.
        price_tag = offer.find('p', attrs={'class': 'origin-store-offerprice-price otkprice'})
        game_final_price = '-1' if price_tag is None else price_tag.get_text()
        game_final_price = stripAndreplace(game_final_price)

        # Discount label, looked up once. The fallback text is passed to
        # dealSaveUpto unchanged and is kept byte-for-byte.
        # NOTE(review): dealSaveUpto presumably recognizes this exact
        # string - confirm before correcting its spelling.
        discount_tag = offer.find('span', attrs={'class': 'otkprice-sale'})
        game_discount = 'Internert not connect' if discount_tag is None else discount_tag.get_text()
        game_discount = dealSaveUpto(game_discount)

        # The helper returns the pair (original price, final price).
        game_original_price, game_final_price = getOriginPriceAndFinalPrice(game_final_price, game_discount)

        game_source = 'origin'
        game_now_time = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M')

        game = EpicGamePrice.Game(game_name, game_original_price, game_final_price,
                                  game_discount, game_source, game_src, game_now_time)
        print(game.printProperty())
        game_list.append(game)

将数据写入csv文件中

def write_to_excel(game_list, path):
    """Write the collected game records to a CSV file.

    The file is written as UTF-8 with a BOM (``utf-8-sig``) and starts
    with a fixed header row.

    Args:
        game_list: Records to write. Each record's ``get_*`` members are
            read as attributes, exactly as before.
            NOTE(review): if these are plain methods on ``Game`` rather
            than properties they would need to be called - confirm
            against the ``Game`` class.
        path: Destination CSV path; an existing file is overwritten.
    """
    header = ["game_name", "game_original_price", "game_final_price", "game_discount",
              "game_source", 'game_src', 'game_now_date']
    # newline='' lets the csv module control line endings; without it
    # every row is followed by a blank line on Windows. The context
    # manager closes the file even if a row fails to serialize.
    with open(path, 'w', encoding='utf-8-sig', newline='') as file:
        csv_writer = csv.writer(file)
        csv_writer.writerow(header)
        for game in game_list:
            csv_writer.writerow([game.get_name, game.get_original_price, game.get_final_price,
                                 game.get_discount, game.get_source, game.get_src,
                                 game.get_now_date])

爬取效果展示
python爬虫爬取steam,epic,origin平台游戏数据_第4张图片

你可能感兴趣的:(python,python,爬虫)