This was my first time taking part in a feature-engineering style Kaggle competition, so I have written up my experience in detail below.
One day before the Stage 1 deadline someone published a very good feature-engineering kernel that runs much faster than the other approaches. Let's study it.
First, the full source code:
import multiprocessing as mp
import pandas as pd
from time import time
from scipy.sparse import csr_matrix
import os
from sklearn.linear_model import Ridge
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import gc
from sklearn.base import BaseEstimator, TransformerMixin
import re
from pandas.api.types import is_numeric_dtype, is_categorical_dtype
os.environ['MKL_NUM_THREADS'] = '4'
os.environ['OMP_NUM_THREADS'] = '4'
os.environ['JOBLIB_START_METHOD'] = 'forkserver'
INPUT_PATH = r'../input'
def dameraulevenshtein(seq1, seq2):
"""Calculate the Damerau-Levenshtein distance between sequences.
This method has not been modified from the original.
Source: http://mwh.geek.nz/2009/04/26/python-damerau-levenshtein-distance/
This distance is the number of additions, deletions, substitutions,
and transpositions needed to transform the first sequence into the
second. Although generally used with strings, any sequences of
comparable objects will work.
Transpositions are exchanges of *consecutive* characters; all other
operations are self-explanatory.
This implementation is O(N*M) time and O(M) space, for N and M the
lengths of the two sequences.
>>> dameraulevenshtein('ba', 'abc')
2
>>> dameraulevenshtein('fee', 'deed')
2
It works with arbitrary sequences too:
>>> dameraulevenshtein('abcd', ['b', 'a', 'c', 'd', 'e'])
2
"""
# codesnippet:D0DE4716-B6E6-4161-9219-2903BF8F547F
# Conceptually, this is based on a len(seq1) + 1 * len(seq2) + 1 matrix.
# However, only the current and two previous rows are needed at once,
# so we only store those.
oneago = None
thisrow = list(range(1, len(seq2) + 1)) + [0]
for x in range(len(seq1)):
# Python lists wrap around for negative indices, so put the
# leftmost column at the *end* of the list. This matches with
# the zero-indexed strings and saves extra calculation.
twoago, oneago, thisrow = (oneago, thisrow, [0] * len(seq2) + [x + 1])
for y in range(len(seq2)):
delcost = oneago[y] + 1
addcost = thisrow[y - 1] + 1
subcost = oneago[y - 1] + (seq1[x] != seq2[y])
thisrow[y] = min(delcost, addcost, subcost)
# This block deals with transpositions
if (x > 0 and y > 0 and seq1[x] == seq2[y - 1]
and seq1[x - 1] == seq2[y] and seq1[x] != seq2[y]):
thisrow[y] = min(thisrow[y], twoago[y - 2] + 1)
return thisrow[len(seq2) - 1]
class SymSpell:
def __init__(self, max_edit_distance=3, verbose=0):
self.max_edit_distance = max_edit_distance
self.verbose = verbose
# 0: top suggestion
# 1: all suggestions of smallest edit distance
# 2: all suggestions <= max_edit_distance (slower, no early termination)
self.dictionary = {}
self.longest_word_length = 0
def get_deletes_list(self, w):
"""given a word, derive strings with up to max_edit_distance characters
deleted"""
deletes = []
queue = [w]
for d in range(self.max_edit_distance):
temp_queue = []
for word in queue:
if len(word) > 1:
for c in range(len(word)): # character index
word_minus_c = word[:c] + word[c + 1:]
if word_minus_c not in deletes:
deletes.append(word_minus_c)
if word_minus_c not in temp_queue:
temp_queue.append(word_minus_c)
queue = temp_queue
return deletes
def create_dictionary_entry(self, w):
'''add word and its derived deletions to dictionary'''
# check if word is already in dictionary
# dictionary entries are in the form: (list of suggested corrections,
# frequency of word in corpus)
new_real_word_added = False
if w in self.dictionary:
# increment count of word in corpus
self.dictionary[w] = (self.dictionary[w][0], self.dictionary[w][1] + 1)
else:
self.dictionary[w] = ([], 1)
self.longest_word_length = max(self.longest_word_length, len(w))
if self.dictionary[w][1] == 1:
# first appearance of word in corpus
# n.b. word may already be in dictionary as a derived word
# (deleting character from a real word)
# but counter of frequency of word in corpus is not incremented
# in those cases)
new_real_word_added = True
deletes = self.get_deletes_list(w)
for item in deletes:
if item in self.dictionary:
# add (correct) word to delete's suggested correction list
self.dictionary[item][0].append(w)
else:
# note frequency of word in corpus is not incremented
self.dictionary[item] = ([w], 0)
return new_real_word_added
def create_dictionary_from_arr(self, arr, token_pattern=r'[a-z]+'):
total_word_count = 0
unique_word_count = 0
for line in arr:
# separate by words by non-alphabetical characters
words = re.findall(token_pattern, line.lower())
for word in words:
total_word_count += 1
if self.create_dictionary_entry(word):
unique_word_count += 1
print("total words processed: %i" % total_word_count)
print("total unique words in corpus: %i" % unique_word_count)
print("total items in dictionary (corpus words and deletions): %i" % len(self.dictionary))
print(" edit distance for deletions: %i" % self.max_edit_distance)
print(" length of longest word in corpus: %i" % self.longest_word_length)
return self.dictionary
def create_dictionary(self, fname):
total_word_count = 0
unique_word_count = 0
with open(fname) as file:
for line in file:
# separate by words by non-alphabetical characters
words = re.findall('[a-z]+', line.lower())
for word in words:
total_word_count += 1
if self.create_dictionary_entry(word):
unique_word_count += 1
print("total words processed: %i" % total_word_count)
print("total unique words in corpus: %i" % unique_word_count)
print("total items in dictionary (corpus words and deletions): %i" % len(self.dictionary))
print(" edit distance for deletions: %i" % self.max_edit_distance)
print(" length of longest word in corpus: %i" % self.longest_word_length)
return self.dictionary
def get_suggestions(self, string, silent=False):
"""return list of suggested corrections for potentially incorrectly
spelled word"""
if (len(string) - self.longest_word_length) > self.max_edit_distance:
if not silent:
print("no items in dictionary within maximum edit distance")
return []
suggest_dict = {}
min_suggest_len = float('inf')
queue = [string]
q_dictionary = {} # items other than string that we've checked
while len(queue) > 0:
q_item = queue[0] # pop
queue = queue[1:]
# early exit
if ((self.verbose < 2) and (len(suggest_dict) > 0) and
((len(string) - len(q_item)) > min_suggest_len)):
break
# process queue item
if (q_item in self.dictionary) and (q_item not in suggest_dict):
if self.dictionary[q_item][1] > 0:
# word is in dictionary, and is a word from the corpus, and
# not already in suggestion list so add to suggestion
# dictionary, indexed by the word with value (frequency in
# corpus, edit distance)
# note q_items that are not the input string are shorter
# than input string since only deletes are added (unless
# manual dictionary corrections are added)
assert len(string) >= len(q_item)
suggest_dict[q_item] = (self.dictionary[q_item][1],
len(string) - len(q_item))
# early exit
if (self.verbose < 2) and (len(string) == len(q_item)):
break
elif (len(string) - len(q_item)) < min_suggest_len:
min_suggest_len = len(string) - len(q_item)
# the suggested corrections for q_item as stored in
# dictionary (whether or not q_item itself is a valid word
# or merely a delete) can be valid corrections
for sc_item in self.dictionary[q_item][0]:
if sc_item not in suggest_dict:
# compute edit distance
# suggested items should always be longer
# (unless manual corrections are added)
assert len(sc_item) > len(q_item)
# q_items that are not input should be shorter
# than original string
# (unless manual corrections added)
assert len(q_item) <= len(string)
if len(q_item) == len(string):
assert q_item == string
item_dist = len(sc_item) - len(q_item)
# item in suggestions list should not be the same as
# the string itself
assert sc_item != string
# calculate edit distance using, for example,
# Damerau-Levenshtein distance
item_dist = dameraulevenshtein(sc_item, string)
# do not add words with greater edit distance if
# verbose setting not on
if (self.verbose < 2) and (item_dist > min_suggest_len):
pass
elif item_dist <= self.max_edit_distance:
assert sc_item in self.dictionary # should already be in dictionary if in suggestion list
suggest_dict[sc_item] = (self.dictionary[sc_item][1], item_dist)
if item_dist < min_suggest_len:
min_suggest_len = item_dist
# depending on order words are processed, some words
# with different edit distances may be entered into
# suggestions; trim suggestion dictionary if verbose
# setting not on
if self.verbose < 2:
suggest_dict = {k: v for k, v in suggest_dict.items() if v[1] <= min_suggest_len}
# now generate deletes (e.g. a substring of string or of a delete)
# from the queue item
# as additional items to check -- add to end of queue
assert len(string) >= len(q_item)
# do not add words with greater edit distance if verbose setting
# is not on
if (self.verbose < 2) and ((len(string) - len(q_item)) > min_suggest_len):
pass
elif (len(string) - len(q_item)) < self.max_edit_distance and len(q_item) > 1:
for c in range(len(q_item)): # character index
word_minus_c = q_item[:c] + q_item[c + 1:]
if word_minus_c not in q_dictionary:
queue.append(word_minus_c)
q_dictionary[word_minus_c] = None # arbitrary value, just to identify we checked this
# queue is now empty: convert suggestions in dictionary to
# list for output
if not silent and self.verbose != 0:
print("number of possible corrections: %i" % len(suggest_dict))
print(" edit distance for deletions: %i" % self.max_edit_distance)
# output option 1
# sort results by ascending order of edit distance and descending
# order of frequency
# and return list of suggested word corrections only:
# return sorted(suggest_dict, key = lambda x:
# (suggest_dict[x][1], -suggest_dict[x][0]))
# output option 2
# return list of suggestions with (correction,
# (frequency in corpus, edit distance)):
as_list = suggest_dict.items()
# outlist = sorted(as_list, key=lambda (term, (freq, dist)): (dist, -freq))
outlist = sorted(as_list, key=lambda x: (x[1][1], -x[1][0]))
if self.verbose == 0:
return outlist[0]
else:
return outlist
'''
Option 1:
['file', 'five', 'fire', 'fine', ...]
Option 2:
[('file', (5, 0)),
('five', (67, 1)),
('fire', (54, 1)),
('fine', (17, 1))...]
'''
def best_word(self, s, silent=False):
try:
return self.get_suggestions(s, silent)[0]
except:
return None
class ItemSelector(BaseEstimator, TransformerMixin):
def __init__(self, field, start_time=time()):
self.field = field
self.start_time = start_time
def fit(self, x, y=None):
return self
def transform(self, dataframe):
print(f'[{time()-self.start_time}] select {self.field}')
dt = dataframe[self.field].dtype
if is_categorical_dtype(dt):
return dataframe[self.field].cat.codes[:, None]
elif is_numeric_dtype(dt):
return dataframe[self.field][:, None]
else:
return dataframe[self.field]
class DropColumnsByDf(BaseEstimator, TransformerMixin):
def __init__(self, min_df=1, max_df=1.0):
self.min_df = min_df
self.max_df = max_df
def fit(self, X, y=None):
m = X.tocsc()
self.nnz_cols = ((m != 0).sum(axis=0) >= self.min_df).A1
if self.max_df < 1.0:
max_df = m.shape[0] * self.max_df
self.nnz_cols = self.nnz_cols & ((m != 0).sum(axis=0) <= max_df).A1
return self
def transform(self, X, y=None):
m = X.tocsc()
return m[:, self.nnz_cols]
def get_rmsle(y_true, y_pred):
return np.sqrt(mean_squared_log_error(np.expm1(y_true), np.expm1(y_pred)))
def split_cat(text):
try:
cats = text.split("/")
return cats[0], cats[1], cats[2], cats[0] + '/' + cats[1]
except:
print("no category")
return 'other', 'other', 'other', 'other/other'
def brands_filling(dataset):
vc = dataset['brand_name'].value_counts()
brands = vc[vc > 0].index
brand_word = r"[a-z0-9*/+\-'’?!.,|&%®™ôèéü]+"
many_w_brands = brands[brands.str.contains(' ')]
one_w_brands = brands[~brands.str.contains(' ')]
ss2 = SymSpell(max_edit_distance=0)
ss2.create_dictionary_from_arr(many_w_brands, token_pattern=r'.+')
ss1 = SymSpell(max_edit_distance=0)
ss1.create_dictionary_from_arr(one_w_brands, token_pattern=r'.+')
two_words_re = re.compile(r"(?=(\s[a-z0-9*/+\-'’?!.,|&%®™ôèéü]+\s[a-z0-9*/+\-'’?!.,|&%®™ôèéü]+))")
def find_in_str_ss2(row):
for doc_word in two_words_re.finditer(row):
print(doc_word)
suggestion = ss2.best_word(doc_word.group(1), silent=True)
if suggestion is not None:
return doc_word.group(1)
return ''
def find_in_list_ss1(list):
for doc_word in list:
suggestion = ss1.best_word(doc_word, silent=True)
if suggestion is not None:
return doc_word
return ''
def find_in_list_ss2(list):
for doc_word in list:
suggestion = ss2.best_word(doc_word, silent=True)
if suggestion is not None:
return doc_word
return ''
print(f"Before empty brand_name: {len(dataset[dataset['brand_name'] == ''].index)}")
n_name = dataset[dataset['brand_name'] == '']['name'].str.findall(
pat=r"^[a-z0-9*/+\-'’?!.,|&%®™ôèéü]+\s[a-z0-9*/+\-'’?!.,|&%®™ôèéü]+")
dataset.loc[dataset['brand_name'] == '', 'brand_name'] = [find_in_list_ss2(row) for row in n_name]
n_desc = dataset[dataset['brand_name'] == '']['item_description'].str.findall(
pat=r"^[a-z0-9*/+\-'’?!.,|&%®™ôèéü]+\s[a-z0-9*/+\-'’?!.,|&%®™ôèéü]+")
dataset.loc[dataset['brand_name'] == '', 'brand_name'] = [find_in_list_ss2(row) for row in n_desc]
n_name = dataset[dataset['brand_name'] == '']['name'].str.findall(pat=brand_word)
dataset.loc[dataset['brand_name'] == '', 'brand_name'] = [find_in_list_ss1(row) for row in n_name]
desc_lower = dataset[dataset['brand_name'] == '']['item_description'].str.findall(pat=brand_word)
dataset.loc[dataset['brand_name'] == '', 'brand_name'] = [find_in_list_ss1(row) for row in desc_lower]
print(f"After empty brand_name: {len(dataset[dataset['brand_name'] == ''].index)}")
del ss1, ss2
gc.collect()
def preprocess_regex(dataset, start_time=time()):
karats_regex = r'(\d)([\s-]?)(karat|karats|carat|carats|kt)([^\w])'
karats_repl = r'\1k\4'
unit_regex = r'(\d+)[\s-]([a-z]{2})(\s)'
unit_repl = r'\1\2\3'
dataset['name'] = dataset['name'].str.replace(karats_regex, karats_repl)
dataset['item_description'] = dataset['item_description'].str.replace(karats_regex, karats_repl)
print(f'[{time() - start_time}] Karats normalized.')
dataset['name'] = dataset['name'].str.replace(unit_regex, unit_repl)
dataset['item_description'] = dataset['item_description'].str.replace(unit_regex, unit_repl)
print(f'[{time() - start_time}] Units glued.')
def preprocess_pandas(train, test, start_time=time()):
train = train[train.price > 0.0].reset_index(drop=True)
print('Train shape without zero price: ', train.shape)
nrow_train = train.shape[0]
y_train = np.log1p(train["price"])
merge: pd.DataFrame = pd.concat([train, test])
del train
del test
gc.collect()
merge['has_category'] = (merge['category_name'].notnull()).astype('category')
print(f'[{time() - start_time}] Has_category filled.')
merge['category_name'] = merge['category_name'] \
    .fillna('other/other/other') \
    .str.lower() \
    .astype(str)  # fill missing values, lowercase everything, cast to str
merge['general_cat'], merge['subcat_1'], merge['subcat_2'], merge['gen_subcat1'] = \
zip(*merge['category_name'].apply(lambda x: split_cat(x)))
print(f'[{time() - start_time}] Split categories completed.')
merge['has_brand'] = (merge['brand_name'].notnull()).astype('category')
print(f'[{time() - start_time}] Has_brand filled.')
merge['gencat_cond'] = merge['general_cat'].map(str) + '_' + merge['item_condition_id'].astype(str)
merge['subcat_1_cond'] = merge['subcat_1'].map(str) + '_' + merge['item_condition_id'].astype(str)
merge['subcat_2_cond'] = merge['subcat_2'].map(str) + '_' + merge['item_condition_id'].astype(str)
print(f'[{time() - start_time}] Categories and item_condition_id concatenated.')
merge['name'] = merge['name'] \
.fillna('') \
.str.lower() \
.astype(str)
merge['brand_name'] = merge['brand_name'] \
.fillna('') \
.str.lower() \
.astype(str)
merge['item_description'] = merge['item_description'] \
.fillna('') \
.str.lower() \
.replace(to_replace='No description yet', value='')
print(f'[{time() - start_time}] Missing filled.')
preprocess_regex(merge, start_time)
brands_filling(merge)
print(f'[{time() - start_time}] Brand name filled.')
merge['name'] = merge['name'] + ' ' + merge['brand_name']
print(f'[{time() - start_time}] Name concatenated.')
merge['item_description'] = merge['item_description'] \
+ ' ' + merge['name'] \
+ ' ' + merge['subcat_1'] \
+ ' ' + merge['subcat_2'] \
+ ' ' + merge['general_cat'] \
+ ' ' + merge['brand_name']
print(f'[{time() - start_time}] Item description concatenated.')
merge.drop(['price', 'test_id', 'train_id'], axis=1, inplace=True)
return merge, y_train, nrow_train
def intersect_drop_columns(train: csr_matrix, valid: csr_matrix, min_df=0):
t = train.tocsc()
v = valid.tocsc()
nnz_train = ((t != 0).sum(axis=0) >= min_df).A1
nnz_valid = ((v != 0).sum(axis=0) >= min_df).A1
nnz_cols = nnz_train & nnz_valid
res = t[:, nnz_cols], v[:, nnz_cols]
return res
if __name__ == '__main__':
mp.set_start_method('forkserver', True)
start_time = time()
train = pd.read_table(os.path.join(INPUT_PATH, 'train.tsv'),
engine='c',
dtype={'item_condition_id': 'category',
'shipping': 'category'}
)
test = pd.read_table(os.path.join(INPUT_PATH, 'test.tsv'),
engine='c',
dtype={'item_condition_id': 'category',
'shipping': 'category'}
)
print(f'[{time() - start_time}] Finished to load data')
print('Train shape: ', train.shape)
print('Test shape: ', test.shape)
submission: pd.DataFrame = test[['test_id']]
merge, y_train, nrow_train = preprocess_pandas(train, test, start_time)
meta_params = {'name_ngram': (1, 2),
'name_max_f': 75000,
'name_min_df': 10,
'category_ngram': (2, 3),
'category_token': '.+',
'category_min_df': 10,
'brand_min_df': 10,
'desc_ngram': (1, 3),
'desc_max_f': 150000,
'desc_max_df': 0.5,
'desc_min_df': 10}
stopwords = frozenset(['the', 'a', 'an', 'is', 'it', 'this', ])
# 'i', 'so', 'its', 'am', 'are'])
vectorizer = FeatureUnion([
('name', Pipeline([
('select', ItemSelector('name', start_time=start_time)),
('transform', HashingVectorizer(
ngram_range=(1, 2),
n_features=2 ** 28,
norm='l2',
lowercase=False,
stop_words=stopwords
)),
('drop_cols', DropColumnsByDf(min_df=2))
])),
('category_name', Pipeline([
('select', ItemSelector('category_name', start_time=start_time)),
('transform', HashingVectorizer(
ngram_range=(1, 1),
token_pattern='.+',
tokenizer=split_cat,
n_features=2 ** 28,
norm='l2',
lowercase=False
)),
('drop_cols', DropColumnsByDf(min_df=2))
])),
('brand_name', Pipeline([
('select', ItemSelector('brand_name', start_time=start_time)),
('transform', CountVectorizer(
token_pattern='.+',
min_df=2,
lowercase=False
)),
])),
('gencat_cond', Pipeline([
('select', ItemSelector('gencat_cond', start_time=start_time)),
('transform', CountVectorizer(
token_pattern='.+',
min_df=2,
lowercase=False
)),
])),
('subcat_1_cond', Pipeline([
('select', ItemSelector('subcat_1_cond', start_time=start_time)),
('transform', CountVectorizer(
token_pattern='.+',
min_df=2,
lowercase=False
)),
])),
('subcat_2_cond', Pipeline([
('select', ItemSelector('subcat_2_cond', start_time=start_time)),
('transform', CountVectorizer(
token_pattern='.+',
min_df=2,
lowercase=False
)),
])),
('has_brand', Pipeline([
('select', ItemSelector('has_brand', start_time=start_time)),
('ohe', OneHotEncoder())
])),
('shipping', Pipeline([
('select', ItemSelector('shipping', start_time=start_time)),
('ohe', OneHotEncoder())
])),
('item_condition_id', Pipeline([
('select', ItemSelector('item_condition_id', start_time=start_time)),
('ohe', OneHotEncoder())
])),
('item_description', Pipeline([
('select', ItemSelector('item_description', start_time=start_time)),
('hash', HashingVectorizer(
ngram_range=(1, 3),
n_features=2 ** 27,
dtype=np.float32,
norm='l2',
lowercase=False,
stop_words=stopwords
)),
('drop_cols', DropColumnsByDf(min_df=2)),
]))
], n_jobs=1)
sparse_merge = vectorizer.fit_transform(merge)
print(f'[{time() - start_time}] Merge vectorized')
print(sparse_merge.shape)
tfidf_transformer = TfidfTransformer()
X = tfidf_transformer.fit_transform(sparse_merge)
print(f'[{time() - start_time}] TF/IDF completed')
X_train = X[:nrow_train]
print(X_train.shape)
X_test = X[nrow_train:]
del merge
del sparse_merge
del vectorizer
del tfidf_transformer
gc.collect()
X_train, X_test = intersect_drop_columns(X_train, X_test, min_df=1)
print(f'[{time() - start_time}] Drop only in train or test cols: {X_train.shape[1]}')
gc.collect()
ridge = Ridge(solver='auto', fit_intercept=True, alpha=0.4, max_iter=250, normalize=False, tol=0.01)
ridge.fit(X_train, y_train)
print(f'[{time() - start_time}] Train Ridge completed. Iterations: {ridge.n_iter_}')
predsR = ridge.predict(X_test)
print(f'[{time() - start_time}] Predict Ridge completed.')
submission.loc[:, 'price'] = np.expm1(predsR)
submission.loc[submission['price'] < 0.0, 'price'] = 0.0
submission.to_csv("submission_ridge.csv", index=False)
I pulled the first five rows of the training set out into a separate file named train.
Record the start time: start_time = time()
If you read it directly like this:
train = pd.read_csv("train.csv", sep='\t')
then inspecting the dtype of each column shows:
train.dtypes
Out[2]:
train_id int64
name object
item_condition_id int64
category_name object
brand_name object
price float64
shipping int64
item_description object
The author's way of reading, with engine='c', should in theory speed up parsing, and passing dtype changes how the columns are typed in the DataFrame. To see results faster I only keep the first 5 rows (this does not change the conclusions). A small memory-usage sketch follows the dtype listing below.
INPUT_PATH = r'../input'
train = pd.read_table(os.path.join(INPUT_PATH, 'train.tsv'),
engine='c',
dtype={'item_condition_id': 'category',
'shipping': 'category'}
)
train5=train.head(5)
train5.dtypes
Out[2]:
train_id int64
name object
item_condition_id category
category_name object
brand_name object
price float64
shipping category
item_description object
dtype: object
train5.shape
Out[3]: (5, 8)
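As a side note, here is a minimal sketch (my own, not part of the kernel) of why the category dtype matters: a category column usually takes far less memory than the same column stored as int64 or object, and it is also what lets ItemSelector return cat.codes later on.
import pandas as pd

# Hypothetical comparison on the same file; 'train.tsv' and the column names follow the kernel above.
plain = pd.read_table('train.tsv', engine='c')
typed = pd.read_table('train.tsv', engine='c',
                      dtype={'item_condition_id': 'category', 'shipping': 'category'})
print(plain['shipping'].memory_usage(deep=True))   # stored as int64
print(typed['shipping'].memory_usage(deep=True))   # stored as category, typically much smaller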
# From experiment: this line creates a DataFrame named submission whose single column is test_id
submission: pd.DataFrame = test[['test_id']]
submission
Out[8]:
test_id
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
10 10
11 11
12 12
13 13
14 14
15 15
16 16
17 17
18 18
19 19
20 20
21 21
22 22
23 23
24 24
25 25
26 26
27 27
28 28
29 29
...
291098 291098
291099 291099
291100 291100
291101 291101
291102 291102
291103 291103
291104 291104
291105 291105
291106 291106
291107 291107
291108 291108
291109 291109
291110 291110
291111 291111
291112 291112
291113 291113
291114 291114
291115 291115
291116 291116
291117 291117
291118 291118
291119 291119
291120 291120
291121 291121
291122 291122
291123 291123
291124 291124
291125 291125
291126 291126
291127 291127
[291128 rows x 1 columns]
#You may wonder why it is written this way; let's see what happens if it is not:
type(submission)
Out[9]: pandas.core.frame.DataFrame
type(test['test_id'])
Out[10]: pandas.core.series.Series
#A Series is not what we want here: when a Series is written out as csv it has no column name, and we need the column name, so we need the DataFrame type
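A quick sketch of the practical difference (my own example; the pandas versions of that era wrote no header for a Series by default):
import pandas as pd

test = pd.DataFrame({'test_id': [0, 1, 2]})
test[['test_id']].to_csv('frame.csv', index=False)   # double brackets -> DataFrame, csv starts with the 'test_id' header
test['test_id'].to_csv('series.csv', index=False)    # single brackets -> Series, older pandas writes no header by default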
train5['has_category'] = train5['category_name'].notnull().astype('category')
#As an extra check:
train5['category_name'].notnull()
Out[4]:
0 True
1 True
2 True
3 True
4 True
Name: category_name, dtype: bool
Without .astype('category'), the dtype of train5['has_category'] would be 'bool'.
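A minimal check of that dtype difference (my own snippet, reusing train5 from above):
flags_bool = train5['category_name'].notnull()                      # dtype: bool
flags_cat = train5['category_name'].notnull().astype('category')    # dtype: category
print(flags_bool.dtype, flags_cat.dtype)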
train = pd.read_table( 'train.tsv',
engine='c',
dtype={'item_condition_id': 'category',
'shipping': 'category'}
)
train150=train.head(150)
NUM_BRANDS=10
train150['brand_name'].fillna(value='missing', inplace=True)
pop_brand = train150['brand_name'].value_counts().loc[lambda x: x.index != 'missing'].index[:NUM_BRANDS]
###Walkthrough, step 1: count how often each brand appears
train150['brand_name'].value_counts()
Out[1]:
missing 58
Victoria's Secret 5
PINK 4
Sephora 4
Adidas 3
Nike 3
Nintendo 2
Fossil 2
Apple 2
Under Armour 2
FOREVER 21 2
Hollister 2
Brandy Melville 2
Air Jordan 2
Rae Dunn 2
Target 2
Michael Kors 2
LuLaRoe 2
Lululemon 2
Too Faced 2
Gap 1
Levi's® 1
Rock Revival 1
Kylie Cosmetics 1
Spin Master 1
Anthropologie 1
Urban Outfitters 1
Samsung 1
Littlest Pet Shop 1
Ralph Lauren 1
..
Nostalgia Electrics 1
Motherhood Maternity 1
H&M 1
Melissa & Doug 1
The North Face 1
Barbie 1
Acacia Swimwear 1
Miss Me 1
Pokemon 1
Manolo Blahnik 1
MARC JACOBS 1
Merona 1
Scholastic 1
Faded Glory 1
Salvatore Ferragamo 1
Torrid 1
Xbox 1
Bath & Body Works 1
Razer 1
rue 1
patagonia 1
Soffe 1
Steve Madden 1
Nars 1
LC Lauren Conrad 1
lululemon athletica 1
Smashbox 1
Infantino 1
Silver Jeans Co. 1
Boulevard Boutique 1
Name: brand_name, Length: 65, dtype: int64
#Step 2: drop the 'missing' entry
train150['brand_name'].value_counts().loc[lambda x: x.index != 'missing']
Out[2]:
Victoria's Secret 5
PINK 4
Sephora 4
Adidas 3
Nike 3
Nintendo 2
Fossil 2
Apple 2
Under Armour 2
FOREVER 21 2
Hollister 2
Brandy Melville 2
Air Jordan 2
Rae Dunn 2
Target 2
Michael Kors 2
LuLaRoe 2
Lululemon 2
Too Faced 2
Gap 1
Levi's® 1
Rock Revival 1
Kylie Cosmetics 1
Spin Master 1
Anthropologie 1
Urban Outfitters 1
Samsung 1
Littlest Pet Shop 1
Ralph Lauren 1
Tarte 1
..
Nostalgia Electrics 1
Motherhood Maternity 1
H&M 1
Melissa & Doug 1
The North Face 1
Barbie 1
Acacia Swimwear 1
Miss Me 1
Pokemon 1
Manolo Blahnik 1
MARC JACOBS 1
Merona 1
Scholastic 1
Faded Glory 1
Salvatore Ferragamo 1
Torrid 1
Xbox 1
Bath & Body Works 1
Razer 1
rue 1
patagonia 1
Soffe 1
Steve Madden 1
Nars 1
LC Lauren Conrad 1
lululemon athletica 1
Smashbox 1
Infantino 1
Silver Jeans Co. 1
Boulevard Boutique 1
Name: brand_name, Length: 64, dtype: int64
#Step 3: take the 10 most frequent brand names
train150['brand_name'].value_counts().loc[lambda x: x.index != 'missing'].index[:NUM_BRANDS]
Out[3]:
Index(['Victoria's Secret', 'PINK', 'Sephora', 'Adidas', 'Nike', 'Nintendo',
'Fossil', 'Apple', 'Under Armour', 'FOREVER 21'],
dtype='object')
train150.loc[~train150['brand_name'].isin(pop_brand), 'brand_name'] = 'missing'
#Step 1: flag the rows whose 'brand_name' is in pop_brand
train150['brand_name'].isin(pop_brand)
Out[1]:
0 False
1 False
2 False
3 False
4 False
5 False
6 False
7 False
8 True
9 False
10 False
11 True
12 False
13 False
14 False
15 False
16 False
17 False
18 False
19 False
20 False
21 True
22 False
23 False
24 True
25 True
26 True
27 True
28 True
29 True
...
120 False
121 False
122 False
123 False
124 False
125 False
126 False
127 False
128 False
129 False
130 False
131 True
132 False
133 False
134 False
135 False
136 False
137 True
138 False
139 False
140 False
141 False
142 False
143 False
144 False
145 True
146 False
147 False
148 False
149 True
Name: brand_name, Length: 150, dtype: bool
#Step 2: pandas only has isin, there is no "notin", so "not in" is obtained by negating isin with "~"
~train150['brand_name'].isin(pop_brand)
Out[2]:
0 True
1 True
2 True
3 True
4 True
5 True
6 True
7 True
8 False
9 True
10 True
11 False
12 True
13 True
14 True
15 True
16 True
17 True
18 True
19 True
20 True
21 False
22 True
23 True
24 False
25 False
26 False
27 False
28 False
29 False
...
120 True
121 True
122 True
123 True
124 True
125 True
126 True
127 True
128 True
129 True
130 True
131 False
132 True
133 True
134 True
135 True
136 True
137 False
138 True
139 True
140 True
141 True
142 True
143 True
144 True
145 False
146 True
147 True
148 True
149 False
Name: brand_name, Length: 150, dtype: bool
#Step 3: get the brand_name of the rows not in pop_brand; note that index 8 is not listed
train150.loc[~train150['brand_name'].isin(pop_brand), 'brand_name']
Out[3]:
0 missing
1 Razer
2 Target
3 missing
4 missing
5 missing
6 Acacia Swimwear
7 Soffe
9 missing
10 Smashbox
12 rue
13 Scholastic
14 UGG Australia
15 Tarte
16 Wet n Wild
17 missing
18 Too Faced
19 Anthropologie
20 Torrid
22 Samsung
23 missing
30 Too Faced
31 missing
34 missing
35 missing
36 missing
37 Under Armour
38 Xbox
40 Kylie Cosmetics
42 missing
...
113 MARC JACOBS
114 missing
115 Steve Madden
117 missing
120 No Boundaries
121 missing
122 missing
123 Faded Glory
124 Miss Me
125 Salvatore Ferragamo
126 Urban Outfitters
127 missing
128 missing
129 missing
130 missing
132 missing
133 Bath & Body Works
134 Fossil
135 Nars
136 Brandy Melville
138 missing
139 missing
140 Nintendo
141 Manolo Blahnik
142 Ralph Lauren
143 LC Lauren Conrad
144 missing
146 Lululemon
147 Michael Kors
148 missing
Name: brand_name, Length: 121, dtype: object
#Step 4: assign
train150.loc[~train150['brand_name'].isin(pop_brand), 'brand_name'] = 'missing'
#Result: every brand_name that is not among the top pop_brand (however many you choose) is set to 'missing' and blends into the crowd
train150['brand_name']
Out[5]:
0 missing
1 missing
2 missing
3 missing
4 missing
5 missing
6 missing
7 missing
8 Nike
9 missing
10 missing
11 Victoria's Secret
12 missing
13 missing
14 missing
15 missing
16 missing
17 missing
18 missing
19 missing
20 missing
21 Victoria's Secret
22 missing
23 missing
24 FOREVER 21
25 Air Jordan
26 Apple
27 LuLaRoe
28 FOREVER 21
29 Hollister
...
120 missing
121 missing
122 missing
123 missing
124 missing
125 missing
126 missing
127 missing
128 missing
129 missing
130 missing
131 LuLaRoe
132 missing
133 missing
134 missing
135 missing
136 missing
137 Adidas
138 missing
139 missing
140 missing
141 missing
142 missing
143 missing
144 missing
145 Air Jordan
146 missing
147 missing
148 missing
149 Apple
Name: brand_name, Length: 150, dtype: object
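Putting the four steps together, a compact rewrite of this brand-trimming trick (my own helper, not part of the kernel) could look like this:
import pandas as pd

def keep_top_brands(df: pd.DataFrame, num_brands: int = 10) -> pd.DataFrame:
    # Steps 1-3: fill NaN, count brands, drop 'missing', keep the num_brands most frequent ones
    df['brand_name'] = df['brand_name'].fillna('missing')
    counts = df['brand_name'].value_counts()
    pop_brand = counts.loc[counts.index != 'missing'].index[:num_brands]
    # Step 4: every brand outside the popular list collapses to 'missing'
    df.loc[~df['brand_name'].isin(pop_brand), 'brand_name'] = 'missing'
    return df

train150 = keep_top_brands(train150, num_brands=10)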
stopwords = frozenset(['the', 'a', 'an', 'is', 'it', 'this', ])
stopwords
Out[8]: frozenset({'a', 'an', 'is', 'it', 'the', 'this'})
Detailed reference on frozenset:
https://www.programiz.com/python-programming/methods/built-in/frozenset
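In short, a frozenset is an immutable, hashable set, so it supports fast membership tests but cannot be changed after creation:
stopwords = frozenset(['the', 'a', 'an', 'is', 'it', 'this'])
print('the' in stopwords)    # True, O(1) membership test
# stopwords.add('so')        # AttributeError: 'frozenset' object has no attribute 'add'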
A tutorial on doing data mining elegantly with sklearn: http://www.cnblogs.com/jasonfreak/p/5448462.html
(The tutorial is good, but I personally think it has one flaw, though I may be wrong: on the question of partial parallelism, the tutorial's author probably did not know about the ItemSelector approach, i.e. the column-selector transformer used in the kernel above, so he implemented partial parallelism himself.)
官方说明:http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.FeatureUnion.html
Concatenates results of multiple transformer objects.
This estimator applies a list of transformer objects in parallel to the input data, then concatenates the results. This is useful to combine several feature extraction mechanisms into a single transformer.
Parameters of the transformers may be set using its name and the parameter name separated by a ‘__’. A transformer may be replaced entirely by setting the parameter with its name to another transformer, or removed by setting to None.
Chinese documentation (with very useful examples!):
http://sklearn.apachecn.org/cn/0.19.0/modules/pipeline.html
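To make the FeatureUnion idea concrete, here is a stripped-down sketch of the pattern used in the kernel (assuming the ItemSelector class and the merge DataFrame defined above); the real vectorizer simply adds more branches and swaps in HashingVectorizer where needed:
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction.text import CountVectorizer

# Each branch selects one column and vectorizes it; FeatureUnion hstacks the sparse results.
mini_vectorizer = FeatureUnion([
    ('name', Pipeline([
        ('select', ItemSelector('name')),
        ('vect', CountVectorizer(min_df=2)),
    ])),
    ('brand_name', Pipeline([
        ('select', ItemSelector('brand_name')),
        ('vect', CountVectorizer(token_pattern='.+', min_df=2)),
    ])),
])
X_mini = mini_vectorizer.fit_transform(merge)  # sparse matrix with both branches' columns side by side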
For a detailed explanation of fit_transform vs transform see: http://blog.csdn.net/anecdotegyb/article/details/74857055
The gist is as follows:
Q: What exactly is the difference between fit_transform() and transform() in scikit-learn, and can they be used interchangeably?
A: Both apply some consistent transformation to the data (e.g. standardization to ~N(0,1), scaling/mapping to a fixed interval, normalization, regularization, etc.).
fit_transform(partData) first fits on part of the data, learning its overall statistics such as mean, variance, min/max (depending on the transformation), and then transforms that partData, which gives you standardization, normalization, and so on.
Using the statistics learned from that fitted part, transform(restData) applies the same mean, variance, min/max, etc. to the remaining data, so that part and rest are processed in exactly the same way.
You must call fit_transform(partData) first and only then transform(restData).
Calling transform(partData) directly, without fitting first, raises an error.
If, after fit_transform(partData), you call fit_transform(restData) instead of transform(restData), the data still gets normalized, but the two results are not on the same "scale" and differ noticeably.
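A small demonstration of those points with StandardScaler (my own example):
import numpy as np
from sklearn.preprocessing import StandardScaler

part = np.array([[1.0], [2.0], [3.0]])
rest = np.array([[4.0], [5.0]])

scaler = StandardScaler()
part_scaled = scaler.fit_transform(part)   # fit learns mean/std on part, then transforms part
rest_scaled = scaler.transform(rest)       # reuses part's mean/std, so both live on the same scale

# StandardScaler().transform(part)         # NotFittedError: transform called before fit
# StandardScaler().fit_transform(rest)     # rescales rest with its own mean/std -> a different "standard"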
https://stats.stackexchange.com/questions/177082/sklearn-combine-multiple-feature-sets-in-pipeline
http://ju.outofmemory.cn/entry/74958