下载kaggle数据集

第一步:注册 https://www.kaggle.com/account/login?phase=startRegisterTab

需要科学上网,可以下载https://d3doq0jj4dgobw.cloudfront.net/GreenHub%20Setup%202.1.1.exe

不然 Google 的人机验证过不去。

第二步:去https://www.kaggle.com

进入 “Account” 页面,找到 “API” 一栏,
点击 “Create New API Token”,
浏览器会下载得到 “kaggle.json”;把它放到运行脚本目录下的 `api_keys` 文件夹里(下载代码读取的路径是 `api_keys/kaggle.json`)。

第三步:安装 kaggle 包:`pip install kaggle`(下载代码还用到 `pandas`、`requests` 和 `tqdm`,没有安装的话请一并安装)

第四步:日志接口(保存为 `logger_api.py`,第五步的下载代码会从它导入 `get_logger`)

import logging
import sys
from logging.handlers import TimedRotatingFileHandler
# Formatter shared by the console and file handlers below: timestamp, logger
# name, level and message, separated by em dashes.
FORMATTER = logging.Formatter("%(asctime)s — %(name)s — %(levelname)s — %(message)s")
# File name handed to TimedRotatingFileHandler in get_file_handler().
LOG_FILE = "logs.log"

def get_console_handler():
    """Build a handler that writes log records to standard output.

    Returns:
        logging.StreamHandler: Handler bound to ``sys.stdout`` that formats
        records with the module-level ``FORMATTER``.
    """
    stdout_handler = logging.StreamHandler(stream=sys.stdout)
    stdout_handler.setFormatter(FORMATTER)
    return stdout_handler

def get_file_handler():
    """Build a handler that writes log records to ``LOG_FILE``.

    Returns:
        TimedRotatingFileHandler: Handler created with ``when="midnight"``
        that formats records with the module-level ``FORMATTER``.
    """
    rotating_handler = TimedRotatingFileHandler(filename=LOG_FILE, when="midnight")
    rotating_handler.setFormatter(FORMATTER)
    return rotating_handler

def get_logger(logger_name):
    """Return a DEBUG-level logger that writes to the console and the log file.

    ``logging.getLogger`` hands back the same logger object for a given name,
    so handlers are attached only when the logger has none yet. Without that
    check every additional ``get_logger(name)`` call would add another pair of
    handlers and each message would be emitted several times.

    Args:
        logger_name: Name of the logger, typically the caller's ``__name__``.

    Returns:
        logging.Logger: The configured logger, with propagation to parent
        loggers switched off.
    """
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.DEBUG)  # better to have too much log than not enough
    if not logger.handlers:
        # First configuration of this logger: attach both outputs exactly once.
        logger.addHandler(get_console_handler())
        logger.addHandler(get_file_handler())
    # with this pattern, it's rarely necessary to propagate the error up to parent
    logger.propagate = False
    return logger

第五步:下载代码:

import pandas as pd

import requests
import json
import os
from pathlib import Path
import zipfile
from logger_api import get_logger

# Module-level logger created through get_logger imported from logger_api.
logger = get_logger(__name__)

# Root folder for downloads; each dataset goes under DATA_PATH / <its "path">.
DATA_PATH = Path("data")
# Kaggle API token file read by get_kaggle_username_key().
KAGGLE_JSON = Path("api_keys/kaggle.json")
# Evaluated once, when the module is imported: a kaggle.json created later
# during the same run is not picked up.
IS_KAGGLE_KEY = KAGGLE_JSON.exists()
# Cached Kaggle API client; filled in by get_authenticated_kaggle_api().
KAGGLE_API = None

# Catalogue of downloadable datasets, keyed by a display name.
#
# Keys used by the code in this file:
#   "name"     - Kaggle competition/dataset identifier passed to the Kaggle API
#                (present on the Kaggle entries only).
#   "url"      - direct download link (present on the Monash entries only).
#   "path"     - sub-folder of DATA_PATH the files are stored in.
#   "filename" - name of the zip archive that _unzip() extracts and deletes
#                (Monash downloads and Kaggle competition downloads).
# "source" is descriptive only; no code visible in this file reads it
# (note its casing is not uniform: "kaggle" vs "Kaggle").
DATASETS = {
    "1C Sales Dataset": {
        "source": "kaggle",
        "name": "competitive-data-science-predict-future-sales",
        "path": "1c_sales_dataset",
        "filename": "competitive-data-science-predict-future-sales.zip",
    },
    "Montreal Bixi Bike Data": {
        "source": "kaggle",
        "name": "supercooler8/bixi-bike-montreal",
        "path": "bixi_bike_data",
        "filename": "bixi-bike-montreal.zip",
    },
    "Turkish Retail Sales": {
        "source": "Kaggle",
        "name": "berkayalan/retail-sales-data",
        "path": "turkish_retail_sales",
        "filename": "retail-sales-data.zip",
    },
    "Sunspot": {
        "source": "Monash Forecasting Repository",
        "url": "https://zenodo.org/record/4654773/files/sunspot_dataset_with_missing_values.zip?download=1",
        "path": "sunspot",
        "filename": "sunspot.zip",
    },
    "Electricity Demand": {
        "source": "Monash Forecasting Repository",
        "url": "https://zenodo.org/record/4656069/files/elecdemand_dataset.zip?download=1",
        "path": "electricity_demand",
        "filename": "electricity_demand.zip",
    },
    "Dominick Sales": {
        "source": "Monash Forecasting Repository",
        "url": "https://zenodo.org/record/4654802/files/dominick_dataset.zip?download=1",
        "path": "dominick_sales",
        "filename": "dominick_sales.zip",
    },
    # Disabled alternative: the Monash copy of the London Smart Meters data.
    # The active entry with the same key below uses the Kaggle copy instead.
    # "London Smart Meters": {
    #     "source": "Monash Forecasting Repository",
    #     "url": "https://zenodo.org/record/4656072/files/london_smart_meters_dataset_with_missing_values.zip?download=1",
    #     "path": "london_smart_meters",
    #     "filename": "london_smart_meters.zip",
    # },
    "London Smart Meters": {
        "source": "Kaggle",
        "name": "jeanmidev/smart-meters-in-london",
        "path": "london_smart_meters",
        "filename": "smart-meters-in-london.zip",
    },
    "Tourism": {
        "source": "Monash Forecasting Repository",
        "url": "https://zenodo.org/record/4656096/files/tourism_monthly_dataset.zip?download=1",
        "path": "tourism",
        "filename": "tourism.zip",
    },
}


def get_kaggle_username_key(username=None, key=None):
    """Resolve Kaggle credentials from the environment, the arguments or kaggle.json.

    Sources are checked in this order:

    1. ``KAGGLE_USERNAME`` and ``KAGGLE_KEY`` both present in ``os.environ``;
    2. both ``username`` and ``key`` passed as arguments;
    3. the ``kaggle.json`` file at ``KAGGLE_JSON`` (only if it existed when
       the module was imported, see ``IS_KAGGLE_KEY``).

    Args:
        username: Kaggle user name, or ``None``.
        key: Kaggle API key, or ``None``.

    Returns:
        tuple: ``(username, key, found)`` where ``found`` is ``True`` when one
        of the sources above supplied credentials. When ``found`` is ``True``
        both ``username`` and ``key`` are strings.

    Raises:
        KeyError: If kaggle.json lacks the ``"username"`` or ``"key"`` entry.
    """
    if ("KAGGLE_USERNAME" in os.environ) and ("KAGGLE_KEY" in os.environ):
        logger.info("Kaggle Username and Key already set as environment variables")
        # Fill in whatever the caller did not pass from the environment, so
        # the caller never receives None alongside found=True (writing None
        # back into os.environ raises TypeError). Explicitly passed values
        # are returned unchanged, as before.
        if username is None:
            username = os.environ["KAGGLE_USERNAME"]
        if key is None:
            key = os.environ["KAGGLE_KEY"]
        return username, key, True

    if (username is not None) and (key is not None):
        logger.info("Kaggle Username and Key retrieved from parameters")
        return username, key, True

    if IS_KAGGLE_KEY:
        with open(KAGGLE_JSON, "r", encoding="utf-8") as f:
            kaggle_dict = json.load(f)
        username = kaggle_dict["username"]
        key = kaggle_dict["key"]
        logger.info("Kaggle Username and Key retrieved from kaggle.json")
        return username, key, True

    logger.warning(
        "kaggle.json not found in api_keys folder, username and key is not passed as parameter or is not set as required environment variables"
    )
    return username, key, False


def get_authenticated_kaggle_api(username=None, key=None):
    """Return the shared, authenticated Kaggle API client, creating it on first use.

    Credentials are resolved with ``get_kaggle_username_key`` and exported as
    the ``KAGGLE_USERNAME`` / ``KAGGLE_KEY`` environment variables before the
    client is created. The client is cached in the module-level ``KAGGLE_API``
    and reused by later calls.

    Args:
        username: Kaggle user name, or ``None`` to look it up elsewhere.
        key: Kaggle API key, or ``None`` to look it up elsewhere.

    Returns:
        The cached ``KaggleApi`` instance, or ``None`` when no credentials
        were found and no client had been created earlier.
    """
    global KAGGLE_API
    username, key, _authenticate_api = get_kaggle_username_key(username, key)
    if _authenticate_api and KAGGLE_API is None:
        # os.environ only accepts strings. A value can be None when the
        # credentials are already present in the environment, in which case
        # there is nothing to export.
        if username is not None:
            os.environ["KAGGLE_USERNAME"] = username
        if key is not None:
            os.environ["KAGGLE_KEY"] = key
        # Imported only after the environment variables are in place, as the
        # original code did. NOTE(review): presumably the kaggle package reads
        # credentials on import - confirm before moving this to module level.
        from kaggle.api.kaggle_api_extended import KaggleApi

        api = KaggleApi()
        api.authenticate()
        # Cache only after authenticate() succeeded, so a failed attempt does
        # not leave an unauthenticated client behind for later calls.
        KAGGLE_API = api
    return KAGGLE_API


def _download_competition_dataset(api, dataset_details):
    """Fetch the files of a Kaggle competition into the dataset's folder.

    Args:
        api: Object exposing ``competition_download_files`` (the Kaggle client).
        dataset_details: ``DATASETS`` entry; its ``"name"`` and ``"path"``
            keys are read.
    """
    competition = dataset_details["name"]
    target_dir = DATA_PATH / dataset_details["path"]
    api.competition_download_files(competition, path=target_dir, quiet=False)


def _download_dataset(api, dataset_details):
    """Fetch a regular Kaggle dataset into its folder, asking the API to unzip it.

    Args:
        api: Object exposing ``dataset_download_files`` (the Kaggle client).
        dataset_details: ``DATASETS`` entry; its ``"name"`` and ``"path"``
            keys are read.
    """
    dataset_slug = dataset_details["name"]
    target_dir = DATA_PATH / dataset_details["path"]
    api.dataset_download_files(dataset_slug, path=target_dir, quiet=False, unzip=True)


def _unzip(path, filename, delete_zip=True):
    """Extract ``DATA_PATH / path / filename`` into ``DATA_PATH / path``.

    Args:
        path: Dataset sub-folder, relative to ``DATA_PATH``.
        filename: Name of the zip archive inside that folder.
        delete_zip: When true, remove the archive once it has been extracted.
    """
    target_dir = DATA_PATH / path
    archive = target_dir / filename
    with zipfile.ZipFile(str(archive), "r") as archive_file:
        archive_file.extractall(target_dir)
    if not delete_zip:
        return
    archive.unlink()


def download_kaggle_dataset(
    dataset_details, username=None, key=None, competition=False
):
    """Download one Kaggle dataset or competition described by ``dataset_details``.

    Args:
        dataset_details: ``DATASETS`` entry with ``"name"`` and ``"path"``;
            competitions additionally need ``"filename"`` (the archive that
            is extracted and then deleted).
        username: Optional Kaggle user name forwarded to the authentication step.
        key: Optional Kaggle API key forwarded to the authentication step.
        competition: ``True`` to use the competition download and unzip the
            archive locally; ``False`` to use the dataset download, which is
            asked to unzip on its own.

    Raises:
        ValueError: If no authenticated Kaggle client could be obtained.
    """
    api = get_authenticated_kaggle_api(username, key)
    if api is None:
        raise ValueError(
            "Kaggle API wasn't able to authenticate. Please provide username and key."
        )

    if competition:
        _download_competition_dataset(api, dataset_details)
        # Log message spelling fixed ("Donwload" -> "Download").
        logger.info("Download completed. Unzipping..")
        _unzip(dataset_details["path"], dataset_details["filename"], delete_zip=True)
    else:
        _download_dataset(api, dataset_details)


def _download(url, filename, timeout=(10, 60)):
    """Stream ``url`` into ``filename`` while showing a tqdm progress bar.

    Args:
        url: Address to fetch; redirects are followed.
        filename: Destination file. ``~`` is expanded and missing parent
            folders are created.
        timeout: Forwarded to ``requests.get`` as ``(connect, read)`` seconds
            so that a stalled server cannot block forever. Pass ``None`` to
            wait indefinitely (the behavior before this parameter existed).

    Returns:
        pathlib.Path: The resolved path of the written file.

    Raises:
        requests.HTTPError: For 4xx and 5xx responses.
        RuntimeError: For any other response whose status is not 200.
    """
    import functools
    import pathlib
    import shutil
    import requests
    from tqdm.auto import tqdm

    # The context manager releases the streamed connection even when writing
    # the file fails part-way.
    with requests.get(url, stream=True, allow_redirects=True, timeout=timeout) as r:
        if r.status_code != 200:
            r.raise_for_status()  # Raises for 4xx and 5xx codes only, so...
            # ...any other non-200 status is reported explicitly.
            raise RuntimeError(f"Request to {url} returned status code {r.status_code}")
        file_size = int(r.headers.get("Content-Length", 0))

        path = pathlib.Path(filename).expanduser().resolve()
        path.parent.mkdir(parents=True, exist_ok=True)

        desc = "(Unknown total file size)" if file_size == 0 else ""
        r.raw.read = functools.partial(r.raw.read, decode_content=True)  # Decompress if needed
        with tqdm.wrapattr(r.raw, "read", total=file_size, desc=desc) as r_raw:
            with path.open("wb") as f:
                shutil.copyfileobj(r_raw, f)

    return path

def download_monash_dataset(dataset_details):
    """Download a dataset archive from its ``"url"`` and extract it in place.

    Args:
        dataset_details: ``DATASETS`` entry; its ``"url"``, ``"path"`` and
            ``"filename"`` keys are read. The archive is deleted after
            extraction.
    """
    source_url = dataset_details["url"]
    folder = dataset_details["path"]
    archive_name = dataset_details["filename"]
    _download(source_url, DATA_PATH / folder / archive_name)
    _unzip(folder, archive_name, delete_zip=True)


def download_1c_sales(username=None, key=None):
    """Download the "1C Sales Dataset" entry as a Kaggle competition.

    Args:
        username: Optional Kaggle user name.
        key: Optional Kaggle API key.
    """
    logger.info("Downloading 1C Sales Dataset...")
    download_kaggle_dataset(
        DATASETS["1C Sales Dataset"], username=username, key=key, competition=True
    )


def download_bixi_bike(username=None, key=None):
    """Download the "Montreal Bixi Bike Data" entry as a Kaggle dataset.

    Args:
        username: Optional Kaggle user name.
        key: Optional Kaggle API key.
    """
    logger.info("Downloading Montreal Bixi Bike Data Dataset...")
    download_kaggle_dataset(
        DATASETS["Montreal Bixi Bike Data"], username=username, key=key, competition=False
    )

def download_london_smart_meters(username=None, key=None):
    """Download the "London Smart Meters" entry as a Kaggle dataset.

    NOTE: this file defines a second function with the same name further
    down; that later definition replaces this one when the module is loaded.

    Args:
        username: Optional Kaggle user name.
        key: Optional Kaggle API key.
    """
    logger.info("Downloading London Smart Meters Dataset...")
    download_kaggle_dataset(
        DATASETS["London Smart Meters"], username=username, key=key, competition=False
    )

def download_sunspot():
    """Download and extract the "Sunspot" entry via its direct URL."""
    logger.info("Downloading Sunspot Dataset...")
    download_monash_dataset(DATASETS["Sunspot"])

def download_electricity_demand():
    """Download and extract the "Electricity Demand" entry via its direct URL."""
    logger.info("Downloading Electricity Demand Dataset...")
    download_monash_dataset(DATASETS["Electricity Demand"])

def download_dominick_sales():
    """Download and extract the "Dominick Sales" entry via its direct URL."""
    logger.info("Downloading Dominick Sales Dataset...")
    download_monash_dataset(DATASETS["Dominick Sales"])

def download_turkish_sales_data(username=None, key=None):
    """Download the "Turkish Retail Sales" entry as a Kaggle dataset.

    Args:
        username: Optional Kaggle user name.
        key: Optional Kaggle API key.
    """
    logger.info("Downloading Turkish Sales Dataset...")
    download_kaggle_dataset(
        DATASETS["Turkish Retail Sales"], username=username, key=key, competition=False
    )
    
def download_london_smart_meters(username=None, key=None):
    """Download the "London Smart Meters" entry as a Kaggle dataset.

    Args:
        username: Optional Kaggle user name.
        key: Optional Kaggle API key.
    """
    logger.info("Downloading London Smart Meters Dataset...")
    download_kaggle_dataset(
        DATASETS["London Smart Meters"], username=username, key=key, competition=False
    )
    # Supplementary cleanup, left disabled by the original author:
    # os.remove(DATA_PATH/dataset_details['path']/"hhblock_dataset.zip")
    # os.remove(DATA_PATH/dataset_details['path']/"halfhourly_dataset.zip")
    # os.remove(DATA_PATH/dataset_details['path']/"daily_dataset.zip")
    # os.remove(DATA_PATH/dataset_details['path']/"daily_dataset.csv.gz")

def download_tourism():
    """Download and extract the "Tourism" entry via its direct URL.

    Only the Tourism dataset is fetched. The stray ``download_1c_sales()``
    call that had slipped into this function body (making every Tourism
    download also require Kaggle credentials) now lives in the script entry
    point below.
    """
    logger.info("Downloading Tourism Dataset...")
    dataset_details = DATASETS["Tourism"]
    download_monash_dataset(dataset_details)


if __name__ == "__main__":
    # Running the file as a script fetches the same two datasets, in the same
    # order, as before. Uncomment further lines to download more.
    download_tourism()
    download_1c_sales()
    # download_bixi_bike()
    # download_sunspot()
    # download_electricity_demand()
    # download_dominick_sales()
    # download_turkish_sales_data()
    # download_london_smart_meters()

你可能感兴趣的:(人工智能,kaggle,python,机器学习)