# Kaggle solution 1: Mercari Price Suggestion Challenge



import pandas as pd
import numpy as np
import scipy
import math
import time

from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import *

import gc

# Number of most frequent brand names kept as distinct values; every other
# brand is replaced by "Other" before encoding.
NUM_BRANDS = 2500
# min_df passed to the CountVectorizer built for the "name" column.
NAME_MIN_DF = 10
# max_features passed to the TfidfVectorizer built for "item_description".
MAX_FEAT_DESCP = 50000

def PLOG(info):
    """Print *info* followed by the current local time as a progress marker."""
    # asctime() without an argument formats the current local time.
    stamp = time.asctime()
    print(info, ':    ', stamp)


def __rmsle(y, y_pred):
    """Return the root mean squared logarithmic error between two sequences.

    Values are paired by position, so a pandas Series with a shuffled or
    non-default index is handled the same way as a list or array.
    Absolute values are taken before the logarithm, so negative inputs do
    not raise.

    Args:
        y: Sequence of true values.
        y_pred: Sequence of predicted values, same length as ``y``.

    Returns:
        The RMSLE as a Python float.

    Raises:
        AssertionError: If the two sequences differ in length.
        ZeroDivisionError: If the sequences are empty.
    """
    assert len(y) == len(y_pred), "y and y_pred must have the same length"
    if len(y) == 0:
        raise ZeroDivisionError("RMSLE is undefined for empty input")
    # np.asarray drops any pandas index, giving purely positional pairing.
    actual = np.abs(np.asarray(y, dtype=float))
    predicted = np.abs(np.asarray(y_pred, dtype=float))
    log_diff = np.log1p(predicted) - np.log1p(actual)
    return float(np.sqrt(np.mean(log_diff ** 2)))


print("Reading in Data")

# NOTE: nrows=1000 limits each file to its first 1000 rows.
df_train = pd.read_csv('./train.tsv', sep='\t', nrows=1000)
df_test = pd.read_csv('./test.tsv', sep='\t', nrows=1000)
PLOG('1111111')
# Stack train on top of test so every vectorizer is fitted on both sets.
# axis must be passed by keyword: recent pandas rejects it positionally.
df = pd.concat([df_train, df_test], axis=0)
# Remember where the training rows end so the stacked matrix can be split.
nrow_train = df_train.shape[0]
# The model is trained on log1p(price).
y_train = np.log1p(df_train["price"])
PLOG('2222222222')
# The training frame now lives inside df; release the separate copy.
del df_train
gc.collect()
PLOG('333333333')
print(df.memory_usage(deep=True))


# category_name: token counts over the category string, with missing
# categories filled as "Other".
df["category_name"] = df["category_name"].fillna("Other").astype("category")
# Distinct "/"-separated components across all categories
# (not referenced again later in this script).
joined_categories = "/".join(df["category_name"].unique().astype("str"))
unique_categories = pd.Series(joined_categories.split("/")).unique()
count_category = CountVectorizer()
X_category = count_category.fit_transform(df["category_name"])
PLOG('4444444444444')
# brand_name: keep the NUM_BRANDS most frequent brands, bucket the rest as
# "Other", then one-hot encode into a sparse matrix.
df["brand_name"] = df["brand_name"].fillna("unknown")
brand_counts = df["brand_name"].value_counts()
pop_brands = brand_counts.index[:NUM_BRANDS]
is_rare_brand = ~df["brand_name"].isin(pop_brands)
df.loc[is_rare_brand, "brand_name"] = "Other"
df["brand_name"] = df["brand_name"].astype("category")
vect_brand = LabelBinarizer(sparse_output=True)
X_brand = vect_brand.fit_transform(df["brand_name"])
PLOG('555555555555')

# item_description: TF-IDF over 1- to 3-grams with English stop words
# removed, capped at MAX_FEAT_DESCP features. Missing text becomes "None".
df["item_description"] = df["item_description"].fillna("None")
tfidf_options = {
    "max_features": MAX_FEAT_DESCP,
    "ngram_range": (1, 3),
    "stop_words": "english",
}
count_descp = TfidfVectorizer(**tfidf_options)
X_descp = count_descp.fit_transform(df["item_description"])
PLOG('6666666666666')

# item_condition_id, shipping: one-hot encode the condition id and keep the
# shipping column as-is (get_dummies only expands the categorical column).
df["item_condition_id"] = df["item_condition_id"].astype("category")
dummy_frame = pd.get_dummies(df[["item_condition_id", "shipping"]])
# The indicator dtype differs between pandas releases, so cast to float
# explicitly before handing a plain numeric array to scipy. The frame is
# built dense because .values would densify a sparse one anyway.
X_dummies = scipy.sparse.csr_matrix(dummy_frame.astype(np.float64).values)
PLOG('7777777777777')
# name: token counts over the "name" column, keeping only tokens that
# appear in at least NAME_MIN_DF rows.
count = CountVectorizer(min_df=NAME_MIN_DF)
# A missing name would make the vectorizer fail; treat it as empty text,
# which contributes no tokens.
X_name = count.fit_transform(df["name"].fillna(""))
PLOG('8888888888888')



# Concatenate all feature groups column-wise into one CSR matrix; CSR
# supports the row slicing used to split train from test.
feature_blocks = (X_dummies, X_descp, X_brand, X_category, X_name)
X = scipy.sparse.hstack(feature_blocks).tocsr()
PLOG('99999999999999')

# The first nrow_train rows of X are the training rows.
X_train = X[:nrow_train]
model = Ridge(solver="lsqr", fit_intercept=False)

model.fit(X_train, y_train)
PLOG('10')

# The remaining rows are the test rows.
X_test = X[nrow_train:]
preds = model.predict(X_test)

# Report the in-sample RMSLE: training predictions against training prices,
# both mapped back from log1p space. Test rows have no known price, so
# scoring the test predictions against y_train would compare unrelated rows.
train_preds = model.predict(X_train)
print(__rmsle(np.expm1(y_train.values), np.expm1(train_preds)))

# Undo the log1p transform before writing the submission file.
df_test["price"] = np.expm1(preds)
df_test[["test_id", "price"]].to_csv("submission.csv", index=False)

PLOG('11')

# You may also be interested in: (Python, Machine, Learning)