【kaggle】Airbnb新用户的民宿预定结果预测

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
%matplotlib inline
import datetime
import os
import seaborn as sns#数据可视化
from datetime import date
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
import pickle #用于存储模型
import seaborn as sns
from sklearn.metrics import *
from sklearn.model_selection import *

train = pd.read_csv("airbnb/train_users_2.csv")
test = pd.read_csv("airbnb/test_users.csv")
print('the columns name of training dataset:\n',train.columns)
print('the columns name of test dataset:\n',test.columns)

the columns name of training dataset:
 Index(['id', 'date_account_created', 'timestamp_first_active',
       'date_first_booking', 'gender', 'age', 'signup_method', 'signup_flow',
       'language', 'affiliate_channel', 'affiliate_provider',
       'first_affiliate_tracked', 'signup_app', 'first_device_type',
       'first_browser', 'country_destination'],
      dtype='object')
the columns name of test dataset:
 Index(['id', 'date_account_created', 'timestamp_first_active',
       'date_first_booking', 'gender', 'age', 'signup_method', 'signup_flow',
       'language', 'affiliate_channel', 'affiliate_provider',
       'first_affiliate_tracked', 'signup_app', 'first_device_type',
       'first_browser'],
      dtype='object')

# 分析：
# train文件比test文件多了特征-country_destination
# country_destination是需要预测的目标变量
# 数据探索时着重分析train文件，test文件类似

print(train.info())


RangeIndex: 213451 entries, 0 to 213450
Data columns (total 16 columns):
id                         213451 non-null object
date_account_created       213451 non-null object
timestamp_first_active     213451 non-null int64
date_first_booking         88908 non-null object
gender                     213451 non-null object
age                        125461 non-null float64
signup_method              213451 non-null object
signup_flow                213451 non-null int64
language                   213451 non-null object
affiliate_channel          213451 non-null object
affiliate_provider         213451 non-null object
first_affiliate_tracked    207386 non-null object
signup_app                 213451 non-null object
first_device_type          213451 non-null object
first_browser              213451 non-null object
country_destination        213451 non-null object
dtypes: float64(1), int64(2), object(13)
memory usage: 26.1+ MB
None

#分析：
# trian文件包含213451行数据，16个特征
# 每个特征的数据类型和非空数值
# date_first_booking空值较多，在特征提取时可以考虑删除

特征分析：

#1.date_account_created
#查看前几行数据
print(train.date_account_created.head())

0    2010-06-28
1    2011-05-25
2    2010-09-28
3    2011-12-05
4    2010-09-14
Name: date_account_created, dtype: object

#对数据进行统计
print(train.date_account_created.value_counts().head())
print(train.date_account_created.value_counts().tail())

2014-05-13    674
2014-06-24    670
2014-06-25    636
2014-05-20    632
2014-05-14    622
Name: date_account_created, dtype: int64
2010-04-24    1
2010-03-09    1
2010-01-01    1
2010-06-18    1
2010-01-02    1
Name: date_account_created, dtype: int64

#获取信息
print(train.date_account_created.describe())

count         213451
unique          1634
top       2014-05-13
freq             674
Name: date_account_created, dtype: object

#观察用户增长情况
dac_train = train.date_account_created.value_counts()
dac_test = test.date_account_created.value_counts()
#将数据类型转换为datatime类型
dac_train_date = pd.to_datetime(train.date_account_created.value_counts().index)
dac_test_date = pd.to_datetime(test.date_account_created.value_counts().index)
#计算离首次注册时间相差的天数
dac_train_day = dac_train_date - dac_train_date.min()
dac_test_day = dac_test_date - dac_train_date.min()
#motplotlib作图
plt.scatter(dac_train_day.days, dac_train.values, color = 'r', label = 'train dataset')
plt.scatter(dac_test_day.days, dac_test.values, color = 'b', label = 'test dataset')

plt.title("Accounts created vs day")
plt.xlabel("Days")
plt.ylabel("Accounts created")
plt.legend(loc = 'upper left')

# 分析：
# x轴：离首次注册时间相差的天数
# y轴：当天注册的用户数量
# 随着时间的增长,用户注册的数量在急剧上升

#2.timestamp_first_active
#查看头几行数据
print(train.timestamp_first_active.head())

0    20090319043255
1    20090523174809
2    20090609231247
3    20091031060129
4    20091208061105
Name: timestamp_first_active, dtype: int64

#对数据进行统计看非重复值的数量
print(train.timestamp_first_active.value_counts().unique())

[1]

#分析： 结果[1]表明timestamp_first_active没有重复数据

#将时间戳转成日期形式并获取数据信息
tfa_train_dt = train.timestamp_first_active.astype(str).apply(lambda x:  
                                                                    datetime.datetime(int(x[:4]),
                                                                                      int(x[4:6]), 
                                                                                      int(x[6:8]), 
                                                                                      int(x[8:10]), 
                                                                                      int(x[10:12]),
                                                                                      int(x[12:])))
print(tfa_train_dt.describe())

count                  213451
unique                 213451
top       2013-07-01 05:26:34
freq                        1
first     2009-03-19 04:32:55
last      2014-06-30 23:58:24
Name: timestamp_first_active, dtype: object

#3.date_first_booking
#获取数据信息
print(train.date_first_booking.describe())
print(test.date_first_booking.describe())

count          88908
unique          1976
top       2014-05-22
freq             248
Name: date_first_booking, dtype: object
count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: date_first_booking, dtype: float64

# 分析：
# train文件中date_first_booking有大量缺失值
# test文件中date_first_booking全是缺失值
# 可以删除特征date_first_booking

#age
#对数据进行统计
print(train.age.value_counts().head())

30.0    6124
31.0    6016
29.0    5963
28.0    5939
32.0    5855
Name: age, dtype: int64

#分析：用户年龄主要集中在30左右

#柱状图统计
#首先将年龄进行分成4组missing values, too small age, reasonable age, too large age
age_train =[train[train.age.isnull()].age.shape[0],
            train.query('age < 15').age.shape[0],
            train.query("age >= 15 & age <= 90").age.shape[0],
            train.query('age > 90').age.shape[0]]

age_test = [test[test.age.isnull()].age.shape[0],
            test.query('age < 15').age.shape[0],
            test.query("age >= 15 & age <= 90").age.shape[0],
            test.query('age > 90').age.shape[0]]

columns = ['Null', 'age < 15', 'age', 'age > 90']

# plot
fig, (ax1,ax2) = plt.subplots(1,2,sharex=True, sharey = True,figsize=(10,5))

sns.barplot(columns, age_train, ax = ax1)
sns.barplot(columns, age_test, ax = ax2)

ax1.set_title('training dataset')
ax2.set_title('test dataset')
ax1.set_ylabel('counts')

#分析：异常年龄较少，且有一定数量的缺失值

#其他特征
#train文件中其他特征由于labels较少，我们可以在特征工程中直接进行one hot encoding即可
#统一使用柱状图进行统计
def feature_barplot(feature, df_train = train, df_test = test, figsize=(10,5), rot = 90, saveimg = False): 
    feat_train = df_train[feature].value_counts()
    feat_test = df_test[feature].value_counts()
    fig_feature, (axis1,axis2) = plt.subplots(1,2,sharex=True, sharey = True, figsize = figsize)
    sns.barplot(feat_train.index.values, feat_train.values, ax = axis1)
    sns.barplot(feat_test.index.values, feat_test.values, ax = axis2)
    axis1.set_xticklabels(axis1.xaxis.get_majorticklabels(), rotation = rot)
    axis2.set_xticklabels(axis1.xaxis.get_majorticklabels(), rotation = rot)
    axis1.set_title(feature + ' of training dataset')
    axis2.set_title(feature + ' of test dataset')
    axis1.set_ylabel('Counts')
    plt.tight_layout()
    if saveimg == True:
        figname = feature + ".png"
        fig_feature.savefig(figname, dpi = 75)

#gender
feature_barplot('gender', saveimg = True)

#signup_method
feature_barplot('signup_method')

#signup_flow
feature_barplot('signup_flow')

#language
feature_barplot('language')

#affiliate_channel
feature_barplot('affiliate_channel')

#first_affiliate_tracked
feature_barplot('first_affiliate_tracked')

#signup_app
feature_barplot('signup_app')

#first_device_type
feature_barplot('first_device_type')

#first_browser
feature_barplot('first_browser')

##sesion文件
#获取数据并查看头10行数据
df_sessions = pd.read_csv('airbnb/sessions.csv')
df_sessions.head(10)

	user_id	action	action_type	action_detail	device_type	secs_elapsed
0	d1mm9tcy42	lookup	NaN	NaN	Windows Desktop	319.0
1	d1mm9tcy42	search_results	click	view_search_results	Windows Desktop	67753.0
2	d1mm9tcy42	lookup	NaN	NaN	Windows Desktop	301.0
3	d1mm9tcy42	search_results	click	view_search_results	Windows Desktop	22141.0
4	d1mm9tcy42	lookup	NaN	NaN	Windows Desktop	435.0
5	d1mm9tcy42	search_results	click	view_search_results	Windows Desktop	7703.0
6	d1mm9tcy42	lookup	NaN	NaN	Windows Desktop	115.0
7	d1mm9tcy42	personalize	data	wishlist_content_update	Windows Desktop	831.0
8	d1mm9tcy42	index	view	view_search_results	Windows Desktop	20842.0
9	d1mm9tcy42	lookup	NaN	NaN	Windows Desktop	683.0

#将user_id改名为id
#这是为了后面的数据合并
df_sessions['id'] = df_sessions['user_id']
df_sessions = df_sessions.drop(['user_id'],axis=1) #按行删除

df_sessions.shape

(10567737, 6)

# (10567737, 6)
# 分析：session文件有10567737行数据，6个特征

#查看缺失值
df_sessions.isnull().sum()

action             79626
action_type      1126204
action_detail    1126204
device_type            0
secs_elapsed      136031
id                 34496
dtype: int64

#分析：action，action_type，action_detail， secs_elapsed缺失值较多

#填充缺失值
df_sessions.action = df_sessions.action.fillna('NAN')
df_sessions.action_type = df_sessions.action_type.fillna('NAN')
df_sessions.action_detail = df_sessions.action_detail.fillna('NAN')
df_sessions.isnull().sum()

action                0
action_type           0
action_detail         0
device_type           0
secs_elapsed     136031
id                34496
dtype: int64

# 分析：
# 填充后缺失值已经为0了
# secs_elapsed 在后续做填充处理

特征提取

#在对数据有一定了解后，我们进行特征提取工作
#对session文件特征提取
#action
df_sessions.action.head()

0            lookup
1    search_results
2            lookup
3    search_results
4            lookup
Name: action, dtype: object

df_sessions.action.value_counts().min()

#分析：对action进行统计，我们可以发现用户action有多种，且最少的发生次数只有1，接下来我们可以对用户发生次数较少的行为列为OTHER一类

#将特征action次数低于阈值100的列为OTHER
act_freq = 100  #Threshold of frequency
act = dict(zip(*np.unique(df_sessions.action, return_counts=True)))
df_sessions.action = df_sessions.action.apply(lambda x: 'OTHER' if act[x] < act_freq else x)
#np.unique(df_sessions.action, return_counts=True) 取以数组形式返回非重复的action值和它的数量
#zip（*（a,b））a,b种元素一一对应，返回zip object

#对特征action，action_detail，action_type，device_type，secs_elapsed进行细化
# 首先将用户的特征根据用户id进行分组
# **特征action：**统计每个用户总的action出现的次数，各个action类型的数量，平均值以及标准差
# **特征action_detail：**统计每个用户总的action_detail出现的次数，各个action_detail类型的数量，平均值以及标准差
# **特征action_type：**统计每个用户总的action_type出现的次数，各个action_type类型的数量，平均值，标准差以及总的停留时长（进行log处理）
# **特征device_type：**统计每个用户总的device_type出现的次数，各个device_type类型的数量，平均值以及标准差
# **特征secs_elapsed：**对缺失值用0填充，统计每个用户secs_elapsed时间的总和，平均值，标准差以及中位数（进行log处理），（总和/平均数），secs_elapsed（log处理后）各个时间出现的次数
#对action特征进行细化
f_act = df_sessions.action.value_counts().argsort()
f_act_detail = df_sessions.action_detail.value_counts().argsort()
f_act_type = df_sessions.action_type.value_counts().argsort()
f_dev_type = df_sessions.device_type.value_counts().argsort()

#按照id进行分组
dgr_sess = df_sessions.groupby(['id'])
#Loop on dgr_sess to create all the features.
samples = [] #samples列表
ln = len(dgr_sess) #计算分组后df_sessions的长度

for g in dgr_sess:  #对dgr_sess中每个id的数据进行遍历
    gr = g[1]   #data frame that comtains all the data for a groupby value 'zzywmcn0jv'
    
    l = []  #建一个空列表，临时存放特征
    
    #the id    for example:'zzywmcn0jv'
    l.append(g[0]) #将id值放入空列表中
    
    # number of total actions
    l.append(len(gr))#将id对应数据的长度放入列表
    
    #secs_elapsed 特征中的缺失值用0填充再获取具体的停留时长值
    sev = gr.secs_elapsed.fillna(0).values   #These values are used later.
    
    #action features 特征-用户行为 
    #每个用户行为出现的次数，各个行为类型的数量，平均值以及标准差
    c_act = [0] * len(f_act)
    for i,v in enumerate(gr.action.values): #i是从0-1对应的位置，v 是用户行为特征的值
        c_act[f_act[v]] += 1
    _, c_act_uqc = np.unique(gr.action.values, return_counts=True)
    #计算用户行为行为特征各个类型数量的长度，平均值以及标准差
    c_act += [len(c_act_uqc), np.mean(c_act_uqc), np.std(c_act_uqc)]
    l = l + c_act
    
    #action_detail features 特征-用户行为具体
    #(how many times each value occurs, numb of unique values, mean and std)
    c_act_detail = [0] * len(f_act_detail)
    for i,v in enumerate(gr.action_detail.values):
        c_act_detail[f_act_detail[v]] += 1
    _, c_act_det_uqc = np.unique(gr.action_detail.values, return_counts=True)
    c_act_detail += [len(c_act_det_uqc), np.mean(c_act_det_uqc), np.std(c_act_det_uqc)]
    l = l + c_act_detail
    
    #action_type features  特征-用户行为类型 click等
    #(how many times each value occurs, numb of unique values, mean and std
    #+ log of the sum of secs_elapsed for each value)
    l_act_type = [0] * len(f_act_type)
    c_act_type = [0] * len(f_act_type)
    for i,v in enumerate(gr.action_type.values):
        l_act_type[f_act_type[v]] += sev[i] #sev = gr.secs_elapsed.fillna(0).values ，求每个行为类型总的停留时长
        c_act_type[f_act_type[v]] += 1  
    l_act_type = np.log(1 + np.array(l_act_type)).tolist() #每个行为类型总的停留时长，差异比较大，进行log处理
    _, c_act_type_uqc = np.unique(gr.action_type.values, return_counts=True)
    c_act_type += [len(c_act_type_uqc), np.mean(c_act_type_uqc), np.std(c_act_type_uqc)]
    l = l + c_act_type + l_act_type    
    
    #device_type features 特征-设备类型
    #(how many times each value occurs, numb of unique values, mean and std)
    c_dev_type  = [0] * len(f_dev_type)
    for i,v in enumerate(gr.device_type .values):
        c_dev_type[f_dev_type[v]] += 1 
    c_dev_type.append(len(np.unique(gr.device_type.values))) 
    _, c_dev_type_uqc = np.unique(gr.device_type.values, return_counts=True)
    c_dev_type += [len(c_dev_type_uqc), np.mean(c_dev_type_uqc), np.std(c_dev_type_uqc)]        
    l = l + c_dev_type    
    
    #secs_elapsed features  特征-停留时长     
    l_secs = [0] * 5 
    l_log = [0] * 15
    if len(sev) > 0:
        #Simple statistics about the secs_elapsed values.
        l_secs[0] = np.log(1 + np.sum(sev))
        l_secs[1] = np.log(1 + np.mean(sev)) 
        l_secs[2] = np.log(1 + np.std(sev))
        l_secs[3] = np.log(1 + np.median(sev))
        l_secs[4] = l_secs[0] / float(l[1]) #
        
        #Values are grouped in 15 intervals. Compute the number of values
        #in each interval.
        #sev = gr.secs_elapsed.fillna(0).values 
        log_sev = np.log(1 + sev).astype(int)
        #np.bincount():Count number of occurrences of each value in array of non-negative ints.  
        l_log = np.bincount(log_sev, minlength=15).tolist()                    
    l = l + l_secs + l_log
    
    #The list l has the feature values of one sample.
    samples.append(l)

#preparing objects    
samples = np.array(samples) 
samp_ar = samples[:, 1:].astype(np.float16) #取除id外的特征数据
samp_id = samples[:, 0]   #取id，id位于第一列

#为提取的特征创建一个dataframe     
col_names = []    #name of the columns
for i in range(len(samples[0])-1):  #减1的原因是因为有个id
    col_names.append('c_' + str(i))  #起名字的方式    
df_agg_sess = pd.DataFrame(samp_ar, columns=col_names)
df_agg_sess['id'] = samp_id
df_agg_sess.index = df_agg_sess.id #将id作为index

df_agg_sess.head()

	c_0	c_1	c_2	c_3	c_4	c_5	c_6	c_7	c_8	c_9	...	c_448	c_449	c_450	c_451	c_452	c_453	c_454	c_455	c_456	id
id
00023iyk9l	40.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	12.0	6.0	2.0	3.0	3.0	1.0	0.0	1.0	0.0	00023iyk9l
0010k6l0om	63.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	8.0	12.0	2.0	8.0	4.0	3.0	0.0	0.0	0.0	0010k6l0om
001wyh0pz8	90.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	27.0	30.0	9.0	8.0	1.0	0.0	0.0	0.0	0.0	001wyh0pz8
0028jgx1x1	31.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	1.0	2.0	3.0	5.0	4.0	1.0	0.0	0.0	0.0	0028jgx1x1
002qnbzfs5	789.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	111.0	102.0	104.0	57.0	28.0	9.0	4.0	1.0	1.0	002qnbzfs5

5 rows × 458 columns

#分析：经过特征提取后，session文件由6个特征变为458个特征

# 对trian和test文件进行特征提取

#标记train文件的行数和存储我们进行预测的目标变量
#labels存储了我们进行预测的目标变量country_destination
train = pd.read_csv("airbnb/train_users_2.csv")
test = pd.read_csv("airbnb/test_users.csv")
#计算出train的行数，便于之后对train和test数据进行分离操作
train_row = train.shape[0]  

# The label we need to predict
labels = train['country_destination'].values

#删除date_first_booking和train文件中的country_destination
# 数据探索时我们发现date_first_booking在train和test文件中缺失值太多，故删除
# 删除country_destination，用模型预测country_destination，再与已经存储country_destination的labels进行比较，从而判断模型优劣
train.drop(['country_destination', 'date_first_booking'], axis = 1, inplace = True)
test.drop(['date_first_booking'], axis = 1, inplace = True)

#合并train和test文件
#便于进行相同的特征提取操作
df = pd.concat([train, test], axis = 0, ignore_index = True)

#timestamp_first_active
#转换为datetime类型
tfa = df.timestamp_first_active.astype(str).apply(lambda x: datetime.datetime(int(x[:4]),
                                                                          int(x[4:6]), 
                                                                          int(x[6:8]),
                                                                          int(x[8:10]),
                                                                          int(x[10:12]),
                                                                          int(x[12:])))

#提取特征：年，月，日
df['tfa_year'] = np.array([x.year for x in tfa])
df['tfa_month'] = np.array([x.month for x in tfa])
df['tfa_day'] = np.array([x.day for x in tfa])

#提取特征：weekday
#对结果进行one hot encoding编码
#isoweekday() 可以返回一周的星期几，e.g.星期日：0；星期一：1
df['tfa_wd'] = np.array([x.isoweekday() for x in tfa]) 
df_tfa_wd = pd.get_dummies(df.tfa_wd, prefix = 'tfa_wd')  # one hot encoding 
df = pd.concat((df, df_tfa_wd), axis = 1) #添加df['tfa_wd'] 编码后的特征
df.drop(['tfa_wd'], axis = 1, inplace = True)#删除原有未编码的特征

#提取特征：季节
#因为判断季节关注的是月份，故对年份进行统一
Y = 2000
seasons = [(0, (date(Y,  1,  1),  date(Y,  3, 20))),  #'winter'
           (1, (date(Y,  3, 21),  date(Y,  6, 20))),  #'spring'
           (2, (date(Y,  6, 21),  date(Y,  9, 22))),  #'summer'
           (3, (date(Y,  9, 23),  date(Y, 12, 20))),  #'autumn'
           (0, (date(Y, 12, 21),  date(Y, 12, 31)))]  #'winter'

def get_season(dt):
    dt = dt.date() #获取日期
    dt = dt.replace(year=Y) #将年统一换成2000年
    return next(season for season, (start, end) in seasons if start <= dt <= end)

df['tfa_season'] = np.array([get_season(x) for x in tfa])
df_tfa_season = pd.get_dummies(df.tfa_season, prefix = 'tfa_season') # one hot encoding 
df = pd.concat((df, df_tfa_season), axis = 1)
df.drop(['tfa_season'], axis = 1, inplace = True)

# date_account_created
#将date_account_created转换为datetime类型
dac = pd.to_datetime(df.date_account_created)

# 提取特征：年，月，日
df['dac_year'] = np.array([x.year for x in dac])
df['dac_month'] = np.array([x.month for x in dac])
df['dac_day'] = np.array([x.day for x in dac])

#提取特征：weekday
df['dac_wd'] = np.array([x.isoweekday() for x in dac])
df_dac_wd = pd.get_dummies(df.dac_wd, prefix = 'dac_wd')
df = pd.concat((df, df_dac_wd), axis = 1)
df.drop(['dac_wd'], axis = 1, inplace = True)

#提取特征：季节
df['dac_season'] = np.array([get_season(x) for x in dac])
df_dac_season = pd.get_dummies(df.dac_season, prefix = 'dac_season')
df = pd.concat((df, df_dac_season), axis = 1)
df.drop(['dac_season'], axis = 1, inplace = True)

#提取特征：date_account_created和timestamp_first_active之间的差值
#即用户在airbnb平台活跃到正式注册所花的时间
dt_span = dac.subtract(tfa).dt.days

#dt_span的头十行数据
dt_span.value_counts().head(10)

#分析：数据主要集中在-1，可以猜测，用户当天注册dt_span值便是-1
# 从差值提取特征：差值为一天，一月，一年和其他
# 即用户活跃到注册花费的时间为一天，一月，一年或其他
def get_span(dt):
    # dt is an integer
    if dt == -1:
        return 'OneDay'
    elif (dt < 30) & (dt > -1):
        return 'OneMonth'
    elif (dt >= 30) & (dt <= 365):
        return 'OneYear'
    else:
        return 'other'

df['dt_span'] = np.array([get_span(x) for x in dt_span])
df_dt_span = pd.get_dummies(df.dt_span, prefix = 'dt_span')
df = pd.concat((df, df_dt_span), axis = 1)
df.drop(['dt_span'], axis = 1, inplace = True)

#删除原有的特征
#对timestamp_first_active，date_account_created进行特征提取后，从特征列表中删除原有的特征
df.drop(['date_account_created','timestamp_first_active'], axis = 1, inplace = True)

#age
av = df.age.values

#在数据探索阶段，我们发现大部分数据是集中在（15，90）区间的，但有部分年龄分布在（1900，2000）区间，
#我们猜测用户是把出生日期误填为年龄，故进行预处理
#数据来自2014年，故用2014-value
av = np.where(np.logical_and(av<2000, av>1900), 2014-av, av) 
df['age'] = av

#将年龄进行分段
age = df.age
age.fillna(-1, inplace = True) #空值填充为-1
div = 15
def get_age(age):
    # age is a float number  将连续型转换为离散型
    if age < 0:
        return 'NA' #表示是空值
    elif (age < div):
        return div #如果年龄小于15岁，那么返回15岁
    elif (age <= div * 2):
        return div*2 #如果年龄大于15小于等于30岁，则返回30岁
    elif (age <= div * 3):
        return div * 3
    elif (age <= div * 4):
        return div * 4
    elif (age <= div * 5):
        return div * 5
    elif (age <= 110):
        return div * 6
    else:
        return 'Unphysical' #非正常年龄
    
#将分段后的年龄作为新的特征放入特征列表中
df['age'] = np.array([get_age(x) for x in age])
df_age = pd.get_dummies(df.age, prefix = 'age')
df = pd.concat((df, df_age), axis = 1)
df.drop(['age'], axis = 1, inplace = True)

# 其他特征
#在数据探索时，我们发现剩余的特征lables都比较少，故不进一步进行特征提取，只进行one-hot-encoding处理
feat_toOHE = ['gender', 
             'signup_method', 
             'signup_flow', 
             'language', 
             'affiliate_channel', 
             'affiliate_provider', 
             'first_affiliate_tracked', 
             'signup_app', 
             'first_device_type', 
             'first_browser']
#对其他特征进行one-hot-encoding处理
for f in feat_toOHE:
    df_ohe = pd.get_dummies(df[f], prefix=f, dummy_na=True)
    df.drop([f], axis = 1, inplace = True)
    df = pd.concat((df, df_ohe), axis = 1)

#整合提取的所有特征
#我们将对session以及train，test文件中提取的特征进行合并
df_all = pd.merge(df, df_agg_sess, how='left')
df_all = df_all.drop(['id'], axis=1) #删除id
df_all = df_all.fillna(-2)  #对没有sesssion data的特征进行缺失值处理

#加了一列，表示每一行总共有多少空值，这也作为一个特征
df_all['all_null'] = np.array([sum(r<0) for r in df_all.values])

数据准备

#数据准备
# 将train和test数据进行分离操作
#train_row是之前记录的train数据行数
Xtrain = df_all.iloc[:train_row, :] #iloc取左闭右开的区间集合
Xtest = df_all.iloc[train_row:, :]

#将提取的特征生成csv文件
Xtrain.to_csv("airbnb/Airbnb_xtrain_v2.csv")
Xtest.to_csv("airbnb/Airbnb_xtest_v2.csv")
#labels.tofile（）：Write array to a file as text or binary (default)
labels.tofile("airbnb/Airbnb_ytrain_v2.csv", sep='\n', format='%s') #存放目标变量

#读取特征文件
xtrain = pd.read_csv("airbnb/Airbnb_xtrain_v2.csv",index_col=0)
ytrain = pd.read_csv("airbnb/Airbnb_ytrain_v2.csv", header=None)
xtrain.head()

ytrain.head()

#分析：可以发现经过特征提取后特征文件xtrain扩展为665个特征，ytrain中包含训练集中的目标变量

#将目标变量进行labels encoding
le = LabelEncoder()
ytrain_le = le.fit_transform(ytrain.values)
# labels encoding前：
# [‘AU’, ‘CA’, ‘DE’, ‘ES’, ‘FR’, ‘GB’, ‘IT’, ‘NDF’, ‘NL’, ‘PT’, ‘US’,‘other’]
# labels encoding后：
# [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

#提取10%的数据进行模型训练
#减少训练模型花费的时间
n = int(xtrain.shape[0]*0.1)
xtrain_new = xtrain.iloc[:n, :]  #训练数据
ytrain_new = ytrain_le[:n]       #训练数据的目标变量

# StandardScaling the dataset
#Standardization of a dataset is a common requirement for many machine learning estimators: 
#they might behave badly if the individual feature do not more or less look like standard normally distributed data (e.g. Gaussian with 0 mean and unit variance)
X_scaler = StandardScaler()
xtrain_new = X_scaler.fit_transform(xtrain_new)

#评分模型：NDCG
# NDCG是一种衡量排序质量的评价指标，该指标考虑了所有元素的相关性
# 由于我们预测的目标变量并不是二分类变量，故我们用NDGG模型来进行模型评分，判断模型优劣
# 一般二分类变量: 我们习惯于使用 f1 score, precision, recall, auc score来进行模型评分
from sklearn.metrics import make_scorer

def dcg_score(y_true, y_score, k=5):
    
    """
    y_true : array, shape = [n_samples] #数据
        Ground truth (true relevance labels).
    y_score : array, shape = [n_samples, n_classes] #预测的分数
        Predicted scores.
    k : int
    """
    order = np.argsort(y_score)[::-1] #分数从高到低排序
    y_true = np.take(y_true, order[:k]) #取出前k[0,k）个分数
      
    gain = 2 ** y_true - 1   

    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gain / discounts)
  

def ndcg_score(ground_truth, predictions, k=5):   

    """
    Parameters
    ----------
    ground_truth : array, shape = [n_samples]
        Ground truth (true labels represended as integers).
    predictions : array, shape = [n_samples, n_classes] 
        Predicted probabilities. 预测的概率
    k : int
        Rank.
    """
    lb = LabelBinarizer()
    lb.fit(range(len(predictions) + 1))
    T = lb.transform(ground_truth)    
    scores = []
    # Iterate over each y_true and compute the DCG score
    for y_true, y_score in zip(T, predictions):
        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)
        score = float(actual) / float(best)
        scores.append(score)

    return np.mean(scores)

构建模型

#############################Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
lr = LogisticRegression(C = 1.0, penalty='l2', multi_class='ovr')
RANDOM_STATE = 2017  #随机种子

#k-fold cross validation（k-折叠交叉验证）
kf = KFold(n_splits=5, random_state=RANDOM_STATE) #分成5个组
train_score = [] 
cv_score = []

# select a k  (value how many y):
k_ndcg = 3 
# kf.split: Generate indices to split data into training and test set.
for train_index, test_index in kf.split(xtrain_new, ytrain_new):
    #训练集数据分割为训练集和测试集，y是目标变量
    X_train, X_test = xtrain_new[train_index, :], xtrain_new[test_index, :]
    y_train, y_test = ytrain_new[train_index], ytrain_new[test_index]
        
    lr.fit(X_train, y_train)
    
    y_pred = lr.predict_proba(X_test)
    train_ndcg_score = ndcg_score(y_train, lr.predict_proba(X_train), k = k_ndcg)
    cv_ndcg_score = ndcg_score(y_test, y_pred, k=k_ndcg)
    
    train_score.append(train_ndcg_score)
    cv_score.append(cv_ndcg_score)
    
print ("\nThe training score is: {}".format(np.mean(train_score)))
print ("\nThe cv score is: {}".format(np.mean(cv_score)))

#learning curve of logistic regression
# 观察逻辑回归模型学习曲线的变化
# 1. 改变逻辑回归参数iteration
# set the iterations
iteration = [1,5,10,15,20, 50, 100]

kf = KFold(n_splits=3, random_state=RANDOM_STATE)

train_score = []
cv_score = []

# select a k:
k_ndcg = 5

for i, item in enumerate(iteration): 

    lr = LogisticRegression(C=1.0, max_iter=item, tol=1e-5, solver='newton-cg', multi_class='ovr') 
    train_score_iter = []
    cv_score_iter = []

    for train_index, test_index in kf.split(xtrain_new, ytrain_new):
        X_train, X_test = xtrain_new[train_index, :], xtrain_new[test_index, :]
        y_train, y_test = ytrain_new[train_index], ytrain_new[test_index]
       
        lr.fit(X_train, y_train)

        y_pred = lr.predict_proba(X_test)
        train_ndcg_score = ndcg_score(y_train, lr.predict_proba(X_train), k = k_ndcg)
        cv_ndcg_score = ndcg_score(y_test, y_pred, k=k_ndcg)

        
        train_score_iter.append(train_ndcg_score)
        cv_score_iter.append(cv_ndcg_score)
        
    train_score.append(np.mean(train_score_iter))
    cv_score.append(np.mean(cv_score_iter))

ymin = np.min(cv_score)-0.05
ymax = np.max(train_score)+0.05

plt.figure(figsize=(9,4))
plt.plot(iteration, train_score, 'ro-', label = 'training')
plt.plot(iteration, cv_score, 'b*-', label = 'Cross-validation')
plt.xlabel("iterations")
plt.ylabel("Score")
plt.xlim(-5, np.max(iteration)+10)
plt.ylim(ymin, ymax)
plt.plot(np.linspace(20,20,50), np.linspace(ymin, ymax, 50), 'g--')
plt.legend(loc = 'lower right', fontsize = 12)
plt.title("Score vs iteration learning curve")

plt.tight_layout()

#分析：随着iteration的增大，逻辑回归模型的评分在不断升高，当iteration超过20的时候，模型的评分基本不变

#2.改变数据量大小
# Chaning the sampling size
# set the iter to the best iteration: iter = 20

perc = [0.01,0.02,0.05,0.1,0.2,0.5,1]

kf = KFold(n_splits=3, random_state=RANDOM_STATE)

train_score = []
cv_score = []

# select a k:
k_ndcg = 5

for i, item in enumerate(perc):
    
    lr = LogisticRegression(C=1.0, max_iter=20, tol=1e-6, solver='newton-cg', multi_class='ovr')
    train_score_iter = []
    cv_score_iter = []
    
    n = int(xtrain_new.shape[0]*item)
    xtrain_perc = xtrain_new[:n, :]
    ytrain_perc = ytrain_new[:n]


    for train_index, test_index in kf.split(xtrain_perc, ytrain_perc):
        
        X_train, X_test = xtrain_perc[train_index, :], xtrain_perc[test_index, :]
        y_train, y_test = ytrain_perc[train_index], ytrain_perc[test_index]

        print(X_train.shape, X_test.shape)
        
        lr.fit(X_train, y_train)

        y_pred = lr.predict_proba(X_test)
        train_ndcg_score = ndcg_score(y_train, lr.predict_proba(X_train), k = k_ndcg)
        cv_ndcg_score = ndcg_score(y_test, y_pred, k=k_ndcg)

        train_score_iter.append(train_ndcg_score)
        cv_score_iter.append(cv_ndcg_score)
        
    train_score.append(np.mean(train_score_iter))
    cv_score.append(np.mean(cv_score_iter))
    
ymin = np.min(cv_score)-0.1
ymax = np.max(train_score)+0.1

plt.figure(figsize=(9,4))
plt.plot(np.array(perc)*100, train_score, 'ro-', label = 'training')
plt.plot(np.array(perc)*100, cv_score, 'bo-', label = 'Cross-validation')
plt.xlabel("Sample size (unit %)")
plt.ylabel("Score")
plt.xlim(-5, np.max(perc)*100+10)
plt.ylim(ymin, ymax)

plt.legend(loc = 'lower right', fontsize = 12)
plt.title("Score vs sample size learning curve")

plt.tight_layout()

#分析：随着数据量的增加，逻辑回归模型对测试集的预测评分（蓝色线）在不断上升，因为我们在训练模型时只用了10%的数据，如果使用全部的数据，效果可能会更好

############################# 树模型
#其中的模型包括DecisionTree，RandomForest，AdaBoost，Bagging，ExtraTree，GraBoost
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import *
from sklearn.svm import SVC, LinearSVC, NuSVC
LEARNING_RATE = 0.1
N_ESTIMATORS = 50
RANDOM_STATE = 2017
MAX_DEPTH = 9

#建了一个tree字典
clf_tree ={
    'DTree': DecisionTreeClassifier(max_depth=MAX_DEPTH,
                                    random_state=RANDOM_STATE),
    
    'RF': RandomForestClassifier(n_estimators=N_ESTIMATORS,
                                 max_depth=MAX_DEPTH,
                                 random_state=RANDOM_STATE),
    
    'AdaBoost': AdaBoostClassifier(n_estimators=N_ESTIMATORS,
                                   learning_rate=LEARNING_RATE,
                                   random_state=RANDOM_STATE),
    
    'Bagging': BaggingClassifier(n_estimators=N_ESTIMATORS,
                                 random_state=RANDOM_STATE),
    
    'ExtraTree': ExtraTreesClassifier(max_depth=MAX_DEPTH,
                                      n_estimators=N_ESTIMATORS,
                                      random_state=RANDOM_STATE),
    
    'GraBoost': GradientBoostingClassifier(learning_rate=LEARNING_RATE,
                                           max_depth=MAX_DEPTH,
                                           n_estimators=N_ESTIMATORS,
                                           random_state=RANDOM_STATE)
}
train_score = []
cv_score = []

kf = KFold(n_splits=3, random_state=RANDOM_STATE)

k_ndcg = 5

for key in clf_tree.keys():
    
    clf = clf_tree.get(key)
    
    train_score_iter = []
    cv_score_iter = []

    for train_index, test_index in kf.split(xtrain_new, ytrain_new):

        X_train, X_test = xtrain_new[train_index, :], xtrain_new[test_index, :]
        y_train, y_test = ytrain_new[train_index], ytrain_new[test_index]
        
        clf.fit(X_train, y_train)

        y_pred = clf.predict_proba(X_test)
        train_ndcg_score = ndcg_score(y_train, clf.predict_proba(X_train), k = k_ndcg)
        cv_ndcg_score = ndcg_score(y_test, y_pred, k=k_ndcg)

        train_score_iter.append(train_ndcg_score)
        cv_score_iter.append(cv_ndcg_score)
        
    train_score.append(np.mean(train_score_iter))
    cv_score.append(np.mean(cv_score_iter))
    
train_score_tree = train_score
cv_score_tree = cv_score

ymin = np.min(cv_score)-0.05
ymax = np.max(train_score)+0.05

x_ticks = clf_tree.keys()

plt.figure(figsize=(8,5))
plt.plot(range(len(x_ticks)), train_score_tree, 'ro-', label = 'training')
plt.plot(range(len(x_ticks)),cv_score_tree, 'bo-', label = 'Cross-validation')

plt.xticks(range(len(x_ticks)),x_ticks,rotation = 45, fontsize = 10)
plt.xlabel("Tree method", fontsize = 12)
plt.ylabel("Score", fontsize = 12)
plt.xlim(-0.5, 5.5)
plt.ylim(ymin, ymax)

plt.legend(loc = 'best', fontsize = 12)
plt.title("Different tree methods")

plt.tight_layout()

######################################  SVM模型
#根据核函数的不同，又分为：SVM-rbf，SVM-poly，SVM-linear等
TOL = 1e-4
MAX_ITER = 1000

clf_svm = {
    
    'SVM-rbf': SVC(kernel='rbf',
                   max_iter=MAX_ITER,
                   tol=TOL, random_state=RANDOM_STATE,
                   decision_function_shape='ovr'),     
    
    'SVM-poly': SVC(kernel='poly',
                   max_iter=MAX_ITER,
                   tol=TOL, random_state=RANDOM_STATE,
                   decision_function_shape='ovr'),     
    
    'SVM-linear': SVC(kernel='linear',
                      max_iter=MAX_ITER,
                      tol=TOL, 
                      random_state=RANDOM_STATE,
                      decision_function_shape='ovr'),  
    
    'LinearSVC': LinearSVC(max_iter=MAX_ITER,
                            tol=TOL,
                            random_state=RANDOM_STATE,
                            multi_class = 'ovr')  
                            
}     

train_score_svm = []
cv_score_svm = []

kf = KFold(n_splits=3, random_state=RANDOM_STATE)

k_ndcg = 5

for key in clf_svm.keys():
    
    clf = clf_svm.get(key)

    train_score_iter = []
    cv_score_iter = []

    for train_index, test_index in kf.split(xtrain_new, ytrain_new):

        X_train, X_test = xtrain_new[train_index, :], xtrain_new[test_index, :]
        y_train, y_test = ytrain_new[train_index], ytrain_new[test_index]
        
        clf.fit(X_train, y_train)

        y_pred = clf.decision_function(X_test)
        train_ndcg_score = ndcg_score(y_train, clf.decision_function(X_train), k = k_ndcg)
        cv_ndcg_score = ndcg_score(y_test, y_pred, k=k_ndcg)

        train_score_iter.append(train_ndcg_score)
        cv_score_iter.append(cv_ndcg_score)
        
    train_score_svm.append(np.mean(train_score_iter))
    cv_score_svm.append(np.mean(cv_score_iter))
    
ymin = np.min(cv_score_svm)-0.05
ymax = np.max(train_score_svm)+0.05

x_ticks = clf_svm.keys()

plt.figure(figsize=(8,5))
plt.plot(range(len(x_ticks)), train_score_svm, 'ro-', label = 'training')
plt.plot(range(len(x_ticks)),cv_score_svm, 'bo-', label = 'Cross-validation')

plt.xticks(range(len(x_ticks)),x_ticks,rotation = 45, fontsize = 10)
plt.xlabel("Tree method", fontsize = 12)
plt.ylabel("Score", fontsize = 12)
plt.xlim(-0.5, 3.5)
plt.ylim(ymin, ymax)

plt.legend(loc = 'best', fontsize = 12)
plt.title("Different SVM methods")

plt.tight_layout()

########################################## xgboost
#aggle比赛中常用的一个模型
import xgboost as xgb

def customized_eval(preds, dtrain):
    labels = dtrain.get_label()
    top = []
    for i in range(preds.shape[0]):
        top.append(np.argsort(preds[i])[::-1][:5])
    mat = np.reshape(np.repeat(labels,np.shape(top)[1]) == np.array(top).ravel(),np.array(top).shape).astype(int)
    score = np.mean(np.sum(mat/np.log2(np.arange(2, mat.shape[1] + 2)),axis = 1))
    return 'ndcg5', score
# xgboost parameters

NUM_XGB = 200

params = {}
params['colsample_bytree'] = 0.6
params['max_depth'] = 6
params['subsample'] = 0.8
params['eta'] = 0.3
params['seed'] = RANDOM_STATE
params['num_class'] = 12
params['objective'] = 'multi:softprob'   # output the probability instead of class. 
train_score_iter = []
cv_score_iter = []

kf = KFold(n_splits = 3, random_state=RANDOM_STATE)

k_ndcg = 5

for train_index, test_index in kf.split(xtrain_new, ytrain_new):

    X_train, X_test = xtrain_new[train_index, :], xtrain_new[test_index, :]
    y_train, y_test = ytrain_new[train_index], ytrain_new[test_index]
    
    train_xgb = xgb.DMatrix(X_train, label= y_train)
    test_xgb = xgb.DMatrix(X_test, label = y_test)
    
    watchlist = [ (train_xgb,'train'), (test_xgb, 'test') ]

    bst = xgb.train(params, 
                     train_xgb,
                     NUM_XGB,
                     watchlist,
                     feval = customized_eval,
                     verbose_eval = 3,
                     early_stopping_rounds = 5)
    
    
    #bst = xgb.train( params, dtrain, num_round, evallist )

    y_pred = np.array(bst.predict(test_xgb))
    y_pred_train = np.array(bst.predict(train_xgb))
    train_ndcg_score = ndcg_score(y_train, y_pred_train , k = k_ndcg)
    cv_ndcg_score = ndcg_score(y_test, y_pred, k=k_ndcg)

    train_score_iter.append(train_ndcg_score)
    cv_score_iter.append(cv_ndcg_score)

train_score_xgb = np.mean(train_score_iter)
cv_score_xgb = np.mean(cv_score_iter)

print ("\nThe training score is: {}".format(train_score_xgb))
print ("The cv score is: {}\n".format(cv_score_xgb))

模型比较

model_cvscore = np.hstack((cv_score_lr, cv_score_tree, cv_score_svm, cv_score_xgb))
model_name = np.array(['LinearReg','ExtraTree','DTree','RF','GraBoost','Bagging','AdaBoost','LinearSVC','SVM-linear','SVM-rbf','SVM-poly','Xgboost'])
fig = plt.figure(figsize=(8,4))

sns.barplot(model_cvscore, model_name, palette="Blues_d")

plt.xticks(rotation=0, size = 10)
plt.xlabel("CV score", fontsize = 12)
plt.ylabel("Model", fontsize = 12)
plt.title("Cross-validation score for different models")

plt.tight_layout()

#总结
# 对数据的理解和探索很重要
# 可以通过特征工程，进一步提取特征
# 模型评估的方法有很多种，选取适宜的模型评估方法
# 目前只用了10%的数据进行模型训练，用全部的数据集进行训练，效果可能会更好
# 需要深入学习模型算法，学会调参

你可能感兴趣的:(数据挖掘)

nosql数据库技术与应用知识点皆过客，揽星河 NoSQL nosql 数据库大数据数据分析数据结构非关系型数据库
Nosql知识回顾大数据处理流程数据采集(flume、爬虫、传感器)数据存储(本门课程NoSQL所处的阶段)Hdfs、MongoDB、HBase等数据清洗(入仓)Hive等数据处理、分析(Spark、Flink等)数据可视化数据挖掘、机器学习应用(Python、SparkMLlib等)大数据时代存储的挑战(三高)高并发(同一时间很多人访问)高扩展(要求随时根据需求扩展存储)高效率(要求读写速度快)
Python实现关联规则推荐这孩子谁懂哈 Python Machine Learning python 关联规则机器学习
1.什么关联规则关联规则（AssociationRules）是反映一个事物与其他事物之间的相互依存性和关联性，如果两个或多个事物之间存在一定的关联关系，那么，其中一个事物就能通过其他事物预测到。关联规则是数据挖掘的一个重要技术，用于从大量数据中挖掘出有价值的数据项之间的相关关系。关联规则挖掘的最经典的例子就是沃尔玛的啤酒与尿布的故事，通过对超市购物篮数据进行分析，即顾客放入购物篮中不同商品之间的关
CV、NLP、数据控掘推荐、量化海的那边- AI算法自然语言处理人工智能
下面是对CV（计算机视觉）、NLP（自然语言处理）、数据挖掘推荐和量化的简要概述及其应用领域的介绍：1.CV（计算机视觉，ComputerVision）定义：计算机视觉是一门让计算机能够从图像或视频中提取有用信息，并做出决策的学科。它通过模拟人类的视觉系统来识别、处理和理解视觉信息。主要任务：图像分类：识别图像中的物体并分类，比如猫、狗、车等。目标检测：在图像或视频中定位并识别多个对象，如人脸检测
【机器学习与R语言】1-机器学习简介苹果酱0567 面试题汇总与解析 java 中间件开发语言 spring boot 后端
1.基本概念机器学习：发明算法将数据转化为智能行为数据挖掘VS机器学习：前者侧重寻找有价值的信息，后者侧重执行已知的任务。后者是前者的先期准备过程：数据——>抽象化——>一般化。或者：收集数据——推理数据——归纳数据——发现规律抽象化：训练：用一个特定模型来拟合数据集的过程用方程来拟合观测的数据：观测现象——数据呈现——模型建立。通过不同的格式来把信息概念化一般化：一般化：将抽象化的知识转换成可用
系统架构师软考历年论文题目（2009-2024年）及分析 pccai-vip 系统架构师系统架构
时间题目20091.论基于DSSA的软件架构设计与应用；2.论信息系统建模方法；3.论基于REST服务的Web应用系统设计；4.论软件可靠性设计与应用20101.论软件的静态演化和动态演化及其应用；2.论数据挖掘技术的应用；3.论大规模分布式系统缓存设计策略；4.论软件可靠性评价20111.论模型驱动架构在系统开发中的应用；2.论企业集成平台的架构设计；3.论企业架构管理与应用；4.论软件需求获取
大数据新视界 --大数据大厂之数据挖掘入门：用 R 语言开启数据宝藏的探索之旅青云交大数据新视界数据库大数据数据挖掘 R 语言算法案例未来趋势应用场景学习建议大数据新视界
亲爱的朋友们，热烈欢迎你们来到青云交的博客！能与你们在此邂逅，我满心欢喜，深感无比荣幸。在这个瞬息万变的时代，我们每个人都在苦苦追寻一处能让心灵安然栖息的港湾。而我的博客，正是这样一个温暖美好的所在。在这里，你们不仅能够收获既富有趣味又极为实用的内容知识，还可以毫无拘束地畅所欲言，尽情分享自己独特的见解。我真诚地期待着你们的到来，愿我们能在这片小小的天地里共同成长，共同进步。本博客的精华专栏：Ja
大数据之flink与hive 星辰_mya 大数据 flink hive
其实吧我不太想写flink，因为线上经验确实不多，这也是我需要补的地方，没有条件创造条件，先来一篇吧flink：高性能低延迟流批一体的分布式计算框架基于事件时间对实时数据精准处理快速响应支持批处理，高效离线分析和数据挖掘数据仓库的引擎丰富数据源/接收器，集成多种数据存储格式和源，比较常见就是咱们今天的主题hive了checkpoint恢复机制，故障恢复快速恢复计算任务分布式弹性扩展，据业务灵活增加
纯生信很难发表？只是你没有及时抓住研究热点 SCI狂人团队
当你还做meta分析的时候，你会发现meta分析很难发或者单位已经不承认了，而聪明的人已经开始做常规的生信GEO、TCGA数据挖掘这些（这个时候生信比较好发）。当你开始做常规的生信GEO、TCGA数据挖掘的时候，你会发现这些一样也是比较难发了，而聪明的人已经开始抓免疫评分这个热点进行生信数据挖掘（这个时候免疫评分比较好发）。当你开始对免疫评分这个热点进行生信数据挖掘的时候，你会发现自己的研究方向差
K-means 算法的介绍与应用小魏冬琅 matlab 算法 kmeans 机器学习
目录引言K-means算法的基本原理表格总结：K-means算法的主要步骤K-means算法的MATLAB实现优化方法与改进K-means算法的应用领域表格总结：K-means算法的主要应用领域结论引言K-means算法是一种经典的基于距离的聚类算法，在数据挖掘、模式识别、图像处理等多个领域中得到了广泛应用。其核心思想是将相似的数据对象聚类到同一个簇中，而使得簇内对象的相似度最大、簇间的相似度最小
Matlab,Python,Java,C++的比较 Codefengfeng python java c++
Matlabmatlab是一个大型计算机，擅长矩阵计算与科学计算，适合构建模型；然而，编译软件的运行效率低，不适合大型软件开发。Pythonpython的优势是简单，入门快。适合做数据挖掘、数据分析、机器学习、人工智能、自然语言处理、爬虫、批量文件处理等，此外，Python开源免费，有很多的库，开发环境开发社区都比较友好；不过，Python是动态型的语言，需要更多的测试，并且错误仅仅是在运行的时候
如何搞定数据挖掘？这篇文章告诉你！ isNotNullX 数据挖掘人工智能
在数字化的时代，数据是我们日常生活中不可或缺的一部分。数据所蕴含的信息具有重要价值，而数据挖掘和数据分析就是解读这些信息的重要工具。本文从明晰数据概念入手，再探讨数据挖掘。一·什么是数据？数据定义：数据（Data）是指对客观事物的属性、数量、位置、关系等进行记录和描述的原始材料或信息。数据可以是数字、文字、图像、声音等多种形式，它们是信息的载体，用于表示、传递和存储信息。简单来说，数据就是观测值。
一些机器学习不错的书籍 jimmyleeee 机器学习人工智能
最近，在学习一些机器学习的相关知识，在Github上居然找到了一个可以下载一些不错的介绍机器学习和大数据挖掘和分析的书籍。具体的书籍的信息可以参考一下链接：Books/DataSciencefromScratch.pdfatmaster·varunkashyapks/Books·GitHub
使用SparkSql进行表的分析与统计 xingyuan8 大数据 java
背景我们的数据挖掘平台对数据统计有比较迫切的需求，而Spark本身对数据统计已经做了一些工作，希望梳理一下Spark已经支持的数据统计功能，后期再进行扩展。准备数据在参考文献6中下载鸢尾花数据，此处格式为iris.data格式，先将data后缀改为csv后缀（不影响使用，只是为了保证后续操作不需要修改）。数据格式如下：SepalLengthSepalWidthPetalLengthPetalWid
从零开始学python数据分析-从零开始学Python数据分析与挖掘 PDF 扫描版 weixin_37988176
给大家带来的一篇关于数据挖掘相关的电子书资源，介绍了关于Python、数据分析、数据挖掘方面的内容，本书是由清华大学出版社出版，格式为PDF，资源大小67.8MB，刘顺祥编写，目前豆瓣、亚马逊、当当、京东等电子书综合评分为：7.5。内容介绍从零开始学Python数据分析与挖掘本书以Python3版本作为数据分析与挖掘实战的应用工具，从Pyhton的基础语法开始，陆续介绍有关数值计算的Numpy、数
废字承晔儿
u额堵不堵不断进步数据挖掘额v也得分发的大跳脱衣舞一个月肚饿肚饿金额见到你的就不会预计不不会吧菊花怪下班v触宝电话代表大会素冠荷鼎厚度还是v四川饭馆有电梯的但丁地狱冬天的多点多发发动态鼎泰丰饭地方放多放房东鹅二房方圆大厦？而他得让让热厄尔热水器…
大数据分析与安全分析 Zh&&Li 网络安全运维数据分析安全数据挖掘运维数据库
大数据分析一、大数据安全威胁与需求分析1.1大数据相关概念发展大数据：是指非传统的数据处理工具的数据集大数据特征：海量的数据规模、快速的数据流转、多样的数据类型和价值密度低等大数据的种类和来源非常多，包括结构化、半结构化和非结构化数据有关大数据的新兴网络信息技术应用不断出现，主要包括大规模数据分析处理、数据挖掘、分布式文件系统、分布式数据库、云计算平台、互联网和存储系统1.2大数据安全威胁分析“数
千万级规模高性能、高并发的网络架构经验分享搬砖养女人网络架构经验分享
主题：INTO100沙龙时间：2015年11月21日下午地点：梦想加联合办公空间分享人：卫向军（毕业于北京邮电大学，现任微博平台架构师，先后在微软、金山云、新浪微博从事技术研发工作，专注于系统架构设计、音视频通讯系统、分布式文件系统和数据挖掘等领域。）架构以及我理解中架构的本质在开始谈我对架构本质的理解之前，先谈谈对今天技术沙龙主题的个人见解，千万级规模的网站感觉数量级是非常大的，对这个数量级我们
2021-01-02随笔 0清婉0
人工智能时代最重要的是机器学习，像数据分析、图像识别、数据挖掘、自然语言处理、语音识别等都是以其为基础的，也可以说人工智能的各种应用都需要机器学习来支撑。现在各大公司越来越注重数据的价值，人工成本也是越来越高，所以机器学习也就变得不可或缺了。数据分析、自然语言处理、语音识别，这将是作为前端人员的我，在2021年学习的重点。现收集几本关于数据分析的书籍，作为参考书籍学习：1.《跟着迪哥学Python
Python是什么？Python能干什么？一篇文章让你对Python了如指掌！！武昌库里写JAVA 面试题汇总与解析 spring log4j java 开发语言算法
Python作为当下最热门的编程语言，已经成为了多个领域的首选语言。能用到Python的地方非常多。从入门级小白到专业级的大佬，数据挖掘、科学计算、图像处理、人工智能，Python都可以胜任。或许是因为这种万能属性，现在有很多的小伙伴都开始学习Python。而现在Python的火爆甚至已经来到了程序员的圈子外，进入了国务院《新一代人工智能发展规划的通知》里。Python也已经走进了小学生的课程里，
BAT的大数据战略数据资本主意
实际上，大数据并不是什么新鲜事物。信息革命带来的除了信息的更高效地生产、流通和消费外，还带来数据的爆炸式增长。“引爆点”到来之后，人们发现原有的零散的对数据的利用造成了巨大的浪费。移动互联网浪潮下，数据产生速度前所未有地加快。人类达成共识开始系统性地对数据进行挖掘。这是大数据的初心。数据积累的同时，数据挖掘需要的计算理论、实时的数据收集和流通通道、数据挖掘过程需要使用的软硬件环境都在成熟。概念、模
前端数据埋点小童不学前端前端大数据
前端埋点文章目录前言一、什么是埋点二、为什么采用埋点三、前端埋点方案3.1、手动埋点3.2、可视化埋点3.3、无埋点四、埋点方式前言最近看到一个很有意思的前端数据收集：前端数据埋点，下面说说我的观点一、什么是埋点埋点，是数据采集领域，简单来说就是行为数据收集二、为什么采用埋点数据生产->数据收集->数据处理->数据分析->数据驱动/用户反馈->产品优化/迭代通过大数据处理，数据统计，数据挖掘等加工
寻找区块链行业里数字内容分发的独角兽 BBFund
时至今日，但凡对区块链有所了解的投资人都应该能看到这项技术必将给当前的内容分发行业带来彻底的改变。区块链技术的难以篡改特性适用于数字版权确权，而区块链项目的Token设计正好就是数字内容价值化的最佳解决方案。事实上互联网巨头们也都在内容分发领域奋力拼杀，但他们无非是在内容整合、数据挖掘、精准投放这些方面做文章。面对这个市场里最大的痛点：侵权、利益分配不均等问题，这些中心化的组织要么无能为力，要么自
Java在智能数据挖掘系统的应用 lizi88888 java 数据挖掘开发语言
智能数据挖掘系统是利用机器学习、统计分析等技术从大量数据中自动或半自动地发现模式和知识的系统。Java作为一种流行的编程语言，因其强大的性能和丰富的生态系统，在智能数据挖掘领域的应用非常广泛。本文将探讨Java在智能数据挖掘系统中的应用，并提供示例代码。智能数据挖掘系统概述智能数据挖掘系统通常具备以下功能：数据预处理：包括数据清洗、归一化、特征选择等。模式识别：识别数据中的模式，如分类、聚类、关联
EI会议推荐-第二届大数据与数据挖掘国际会议（BDDM 2024） shiyuankeyan 数据挖掘大数据
第二届大数据与数据挖掘国际会议（BDDM2024）1、基本信息大会官网：http://www.icbddm.org/官方邮箱：[email protected]主办方：武汉纺织大学会议时间：2024年12月13日-12月15日会议地点：湖北武汉02征稿主题：包含（但不限于）以下领域：大数据：大数据分析、人工智能、大数据网络技术、大数据搜索算法和系统、分布式和点对点搜索、基于大数据的机器学习、大数据可视化
Spark MLlib模型训练—聚类算法 K-means 不二人生 Spark ML 实战算法 spark-ml 聚类
SparkMLlib模型训练—聚类算法K-meansK-means是一种经典的聚类算法，广泛应用于数据挖掘、图像处理、推荐系统等领域。它通过将数据划分为(k)个簇（clusters），使得同一簇内的数据点尽可能相似，而不同簇之间的数据点差异尽可能大。ApacheSpark提供了K-means聚类算法的高效实现，支持大规模数据的分布式计算。本文将详细介绍K-means聚类算法的原理，并结合Spark
云计算与分布式技术-常见云的比较 NicolasLearner 服务器云服务器云主机云服务云服务器阿里云腾讯云华为云
云南大学软件学院期中报告SchoolofSoftware,YunnanUniversity个人成绩学号姓名成绩学期:2019秋季学期课程名称:云计算任课教师:陆歌皓姓名:学号：年级:完成提交时间：2019年11月4日目录SchoolofSoftware,YunnanUniversity1云计算概念2什么叫做云计算?2云计算定义及分类2根据iiMediaResearch数据挖掘和分析机构所发论文分析
数据分析利器：Java与MySQL构建强大的数据挖掘系统 lizi88888 数据挖掘数据分析 java
数据分析在当今信息时代具有重要的作用，它可以帮助企业和组织深入理解数据，发现隐藏在数据中的模式和规律，并基于这些洞察进行决策和优化。Java与MySQL作为两个强大的工具，结合起来可以构建出一个高效、可靠且功能丰富的数据挖掘系统。一、Java在数据分析中的应用1、数据处理和清洗：Java提供了丰富的数据处理和操作库，例如ApacheCommons、Jackson等，可以方便地对各种数据格式进行解析
【1】学习前言及数据分析的简单介绍&jupyter的介绍与安装烈风回响 python数据分析 python 数据分析
学习内容学习方法•重视基础•归纳总结，构建自己知识体系•推荐使用xmind思维导图•三多法则•多练习•多应用•多思考发展方向例子：•数据分析班级到课人数•有8人不来上课，这是数据分析吗？数据挖掘与数据分析区别这是现象，不是原因，所以这肯定不是数据分析。若是班主任的业务能力比较强，他对每个同学的上课情况都十分了解可能有五个同学一直加班，比较忙所以没有来上课，还有两个是因为跟不上了，还有一个在谈对象。
GNN会议&期刊汇总（人工智能、机器学习、深度学习、数据挖掘） Bunny_Ben 科研方法&心得人工智能机器学习深度学习笔记神经网络数据挖掘
会议【NeurIPS】全称ConferenceonNeuralInformationProcessingSystems（神经信息处理系统大会），机器学习和计算神经科学领域的顶级学术会议，CCFA。【ICLR】全称InternationalConferenceonLearningRepresentations（国际学习表征会议），深度学习顶会。【AAAI】由人工智能促进协会AAAI（Associat
【统计分析与数据挖掘】基本统计分析方法与数据挖掘技术爱技术的小伙子数据挖掘人工智能
统计分析与数据挖掘基本统计分析方法与数据挖掘技术引言在数据驱动的时代，统计分析与数据挖掘是从大量数据中提取有价值信息的核心技术。统计分析通过数学模型描述和理解数据的特征，而数据挖掘则通过算法自动发现数据中的模式和关系。本文将探讨基本的统计分析方法和常用的数据挖掘技术，帮助读者更好地理解和应用这些工具。1.统计分析概述1.1统计分析的基本概念统计分析是一种利用数据来进行推断和预测的方法。它包括描述性
tomcat基础与部署发布暗黑小菠萝 Tomcat java web
从51cto搬家了，以后会更新在这里方便自己查看。做项目一直用tomcat，都是配置到eclipse中使用，这几天有时间整理一下使用心得，有一些自己配置遇到的细节问题。 Tomcat：一个Servlets和JSP页面的容器，以提供网站服务。一、Tomcat安装安装方式：①运行.exe安装包 &n
网站架构发展的过程 ayaoxinchao 数据库应用服务器网站架构
1.初始阶段网站架构：应用程序、数据库、文件等资源在同一个服务器上 2.应用服务和数据服务分离：应用服务器、数据库服务器、文件服务器 3.使用缓存改善网站性能：为应用服务器提供本地缓存，但受限于应用服务器的内存容量，可以使用专门的缓存服务器，提供分布式缓存服务器架构 4.使用应用服务器集群改善网站的并发处理能力：使用负载均衡调度服务器，将来自客户端浏览器的访问请求分发到应用服务器集群中的任何
[信息与安全]数据库的备份问题 comsci 数据库
如果你们建设的信息系统是采用中心-分支的模式,那么这里有一个问题如果你的数据来自中心数据库,那么中心数据库如果出现故障,你的分支机构的数据如何保证安全呢? 是否应该在这种信息系统结构的基础上进行改造,容许分支机构的信息系统也备份一个中心数据库的文件呢? &n
使用maven tomcat plugin插件debug关联源代码商人shang maven debug 查看源码 tomcat-plugin
*首先需要配置好'''maven-tomcat7-plugin'''，参见[[Maven开发Web项目]]的'''Tomcat'''部分。 *配置好后，在[[Eclipse]]中打开'''Debug Configurations'''界面，在'''Maven Build'''项下新建当前工程的调试。在'''Main'''选项卡中点击'''Browse Workspace...'''选择需要开发的
大访问量高并发 oloz 大访问量高并发
大访问量高并发的网站主要压力还是在于数据库的操作上，尽量避免频繁的请求数据库。下面简要列出几点解决方案： 01、优化你的代码和查询语句，合理使用索引 02、使用缓存技术例如memcache、ecache将不经常变化的数据放入缓存之中 03、采用服务器集群、负载均衡分担大访问量高并发压力 04、数据读写分离 05、合理选用框架，合理架构(推荐分布式架构)。
cache 服务器小猪猪08 cache
Cache 即高速缓存.那么cache是怎么样提高系统性能与运行速度呢？是不是在任何情况下用cache都能提高性能？是不是cache用的越多就越好呢？我在近期开发的项目中有所体会，写下来当作总结也希望能跟大家一起探讨探讨，有错误的地方希望大家批评指正。　　1.Cache 是怎么样工作的? 　　Cache 是分配在服务器上
mysql存储过程香水浓 mysql
Description:插入大量测试数据 use xmpl; drop procedure if exists mockup_test_data_sp; create procedure mockup_test_data_sp( in number_of_records int ) begin declare cnt int; declare name varch
CSS的class、id、css文件名的常用命名规则 agevs JavaScript UI 框架 Ajax css
CSS的class、id、css文件名的常用命名规则 (一)常用的CSS命名规则　　头：header 　　内容：content/container 　　尾：footer 　　导航：nav 　　侧栏：sidebar 　　栏目：column 　　页面外围控制整体布局宽度：wrapper 　　左右中：left right
全局数据源 AILIKES java tomcat mysql jdbc JNDI
实验目的：为了研究两个项目同时访问一个全局数据源的时候是创建了一个数据源对象，还是创建了两个数据源对象。 1：将diuid和mysql驱动包（druid-1.0.2.jar和mysql-connector-java-5.1.15.jar）copy至%TOMCAT_HOME%/lib下；2：配置数据源，将JNDI在%TOMCAT_HOME%/conf/context.xml中配置好,格式如下：&l
MYSQL的随机查询的实现方法 baalwolf mysql
MYSQL的随机抽取实现方法。举个例子，要从tablename表中随机提取一条记录，大家一般的写法就是：SELECT * FROM tablename ORDER BY RAND() LIMIT 1。但是，后来我查了一下MYSQL的官方手册，里面针对RAND()的提示大概意思就是，在ORDER BY从句里面不能使用RAND()函数，因为这样会导致数据列被多次扫描。但是在MYSQL 3.23版本中，
JAVA的getBytes()方法 bijian1013 java eclipse unix OS
在Java中，String的getBytes()方法是得到一个操作系统默认的编码格式的字节数组。这个表示在不同OS下，返回的东西不一样！ String.getBytes(String decode)方法会根据指定的decode编码返回某字符串在该编码下的byte数组表示，如： byte[] b_gbk = "
AngularJS中操作Cookies bijian1013 JavaScript AngularJS Cookies
如果你的应用足够大、足够复杂，那么你很快就会遇到这样一咱种情况：你需要在客户端存储一些状态信息，这些状态信息是跨session(会话)的。你可能还记得利用document.cookie接口直接操作纯文本cookie的痛苦经历。幸运的是，这种方式已经一去不复返了，在所有现代浏览器中几乎
[Maven学习笔记五]Maven聚合和继承特性 bit1129 maven
Maven聚合在实际的项目中，一个项目通常会划分为多个模块，为了说明问题，以用户登陆这个小web应用为例。通常一个web应用分为三个模块： 1. 模型和数据持久化层user-core, 2. 业务逻辑层user-service以 3. web展现层user-web， user-service依赖于user-core user-web依赖于user-core和use
【JVM七】JVM知识点总结 bit1129 jvm
1. JVM运行模式 1.1 JVM运行时分为-server和-client两种模式，在32位机器上只有client模式的JVM。通常，64位的JVM默认都是使用server模式，因为server模式的JVM虽然启动慢点，但是，在运行过程，JVM会尽可能的进行优化 1.2 JVM分为三种字节码解释执行方式：mixed mode, interpret mode以及compiler
linux下查看nginx、apache、mysql、php的编译参数 ronin47
在linux平台下的应用，最流行的莫过于nginx、apache、mysql、php几个。而这几个常用的应用，在手工编译完以后，在其他一些情况下（如：新增模块），往往想要查看当初都使用了那些参数进行的编译。这时候就可以利用以下方法查看。 1、nginx [root@361way ~]# /App/nginx/sbin/nginx -V nginx: nginx version: nginx/
unity中运用Resources.Load的方法？ brotherlamp unity视频 unity资料 unity自学 unity unity教程
问：unity中运用Resources.Load的方法？答：Resources.Load是unity本地动态加载资本所用的方法,也即是你想动态加载的时分才用到它,比方枪弹,特效,某些实时替换的图像什么的,主张此文件夹不要放太多东西,在打包的时分,它会独自把里边的一切东西都会集打包到一同,不论里边有没有你用的东西,所以大多数资本应该是自个建文件放置 1、unity实时替换的物体即是依据环境条件
线段树-入门 bylijinnan java 算法线段树
/** * 线段树入门 * 问题：已知线段[2,5] [4,6] [0,7]；求点2,4,7分别出现了多少次 * 以下代码建立的线段树用链表来保存，且树的叶子结点类似[i,i] * * 参考链接：http://hi.baidu.com/semluhiigubbqvq/item/be736a33a8864789f4e4ad18 * @author lijinna
全选与反选 chicony 全选
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> <html> <head> <title>全选与反选</title>
vim一些简单记录 chenchao051 vim
mac在/usr/share/vim/vimrc linux在/etc/vimrc 1、问：后退键不能删除数据，不能往后退怎么办？答：在vimrc中加入set backspace=2 2、问：如何控制tab键的缩进？答：在vimrc中加入set tabstop=4 (任何
Sublime Text 快捷键 daizj 快捷键 sublime
[size=large][/size]Sublime Text快捷键：Ctrl+Shift+P：打开命令面板Ctrl+P：搜索项目中的文件Ctrl+G：跳转到第几行Ctrl+W：关闭当前打开文件Ctrl+Shift+W：关闭所有打开文件Ctrl+Shift+V：粘贴并格式化Ctrl+D：选择单词，重复可增加选择下一个相同的单词Ctrl+L：选择行，重复可依次增加选择下一行Ctrl+Shift+L：
php 引用(&)详解 dcj3sjt126com PHP
在PHP 中引用的意思是：不同的名字访问同一个变量内容. 与Ｃ语言中的指针是有差别的．Ｃ语言中的指针里面存储的是变量的内容在内存中存放的地址变量的引用 PHP 的引用允许你用两个变量来指向同一个内容复制代码代码如下: <? $a="ABC"; $b =&$a; echo
SVN中trunk,branches,tags用法详解 dcj3sjt126com SVN
Subversion有一个很标准的目录结构，是这样的。比如项目是proj，svn地址为svn://proj/，那么标准的svn布局是svn://proj/|+-trunk+-branches+-tags这是一个标准的布局，trunk为主开发目录，branches为分支开发目录，tags为tag存档目录（不允许修改）。但是具体这几个目录应该如何使用，svn并没有明确的规范，更多的还是用户自己的习惯。
对软件设计的思考 e200702084 设计模式数据结构算法 ssh 活动
软件设计的宏观与微观软件开发是一种高智商的开发活动。一个优秀的软件设计人员不仅要从宏观上把握软件之间的开发，也要从微观上把握软件之间的开发。宏观上，可以应用面向对象设计，采用流行的SSH架构，采用web层，业务逻辑层，持久层分层架构。采用设计模式提供系统的健壮性和可维护性。微观上，对于一个类，甚至方法的调用，从计算机的角度模拟程序的运行情况。了解内存分配，参数传
同步、异步、阻塞、非阻塞 geeksun 非阻塞
同步、异步、阻塞、非阻塞这几个概念有时有点混淆，在此文试图解释一下。同步：发出方法调用后，当没有返回结果，当前线程会一直在等待（阻塞）状态。场景：打电话，营业厅窗口办业务、B/S架构的http请求-响应模式。异步：方法调用后不立即返回结果，调用结果通过状态、通知或回调通知方法调用者或接收者。异步方法调用后，当前线程不会阻塞，会继续执行其他任务。实现：
Reverse SSH Tunnel 反向打洞實錄 hongtoushizi ssh
實際的操作步驟： # 首先，在客戶那理的機器下指令連回我們自己的 Server，並設定自己 Server 上的 12345 port 會對應到幾器上的 SSH port ssh -NfR 12345:localhost:22 [email protected] # 然後在 myhost 的機器上連自己的 12345 port，就可以連回在客戶那的機器 ssh localhost -p 1
Hibernate中的缓存 Josh_Persistence 一级缓存 Hiberante缓存查询缓存二级缓存
Hibernate中的缓存一、Hiberante中常见的三大缓存：一级缓存，二级缓存和查询缓存。 Hibernate中提供了两级Cache，第一级别的缓存是Session级别的缓存，它是属于事务范围的缓存。这一级别的缓存是由hibernate管理的，一般情况下无需进行干预；第二级别的缓存是SessionFactory级别的缓存，它是属于进程范围或群集范围的缓存。这一级别的缓存
对象关系行为模式之延迟加载 home198979 PHP 架构延迟加载
形象化设计模式实战 HELLO!架构一、概念 Lazy Load：一个对象，它虽然不包含所需要的所有数据，但是知道怎么获取这些数据。延迟加载貌似很简单，就是在数据需要时再从数据库获取，减少数据库的消耗。但这其中还是有不少技巧的。二、实现延迟加载实现Lazy Load主要有四种方法：延迟初始化、虚
xml 验证 pengfeicao521 xml xml解析
有些字符，xml不能识别，用jdom或者dom4j解析的时候就报错 public static void testPattern() { // 含有非法字符的串 String str = "Jamey친Ñ&#1282
div设置半透明效果 spjich css 半透明
为div设置如下样式： div{filter:alpha(Opacity=80);-moz-opacity:0.5;opacity: 0.5;} 说明： 1、filter：对win IE设置半透明滤镜效果，filter:alpha(Opacity=80)代表该对象80%半透明，火狐浏览器不认2、-moz-opaci
你真的了解单例模式么？ w574240966 java 单例设计模式 jvm
单例模式，很多初学者认为单例模式很简单，并且认为自己已经掌握了这种设计模式。但事实上，你真的了解单例模式了么。一，单例模式的5中写法。（回字的四种写法，哈哈。） 1，懒汉式（1）线程不安全的懒汉式 public cla