
Last login: Tue Aug 29 09:11:12 2017 from
[root@bdddev-agent-205 ~]# su hive
[hive@bdddev-agent-205 root]$ cd
[hive@bdddev-agent-205 ~]$ hive
2017-08-30 08:56:44,327 WARN  [main] conf.HiveConf: HiveConf of name hive.server2.enable.impersonation does not exist

Logging initialized using configuration in file:/etc/hive/
hive> use yxpt;
Time taken: 0.916 seconds
hive> describe pi_cust_item_recommend;
cust_id                 string
item_id                 bigint
advise_level            decimal(10,0)
date                    int
cust_code               varchar(30)
pack_bar                varchar(30)
ymday                   string

# Partition Information
# col_name              data_type               comment

ymday                   string
Time taken: 0.181 seconds, Fetched: 12 row(s)
hive> select * from pi_cust_item_recommend limit 3;
Time taken: 0.082 seconds


[hdfs@bdddev-agent-205 bin]$ ./pyspark
Python 2.7.5 (default, Nov  6 2016, 00:28:07)
[GCC 4.8.5 20150623 (Red Hat 4.8.5-11)] on linux2
Type "help", "copyright", "credits" or "license" for more information.
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.1.0

Using Python version 2.7.5 (default, Nov  6 2016 00:28:07)
SparkSession available as 'spark'.
>>> from __future__ import division
>>> from pyspark.mllib.recommendation import ALS
>>> from pyspark.sql import HiveContext
>>> from pyspark.sql import SparkSession
>>> from collections import namedtuple
>>> import math
>>> import datetime
>>> spark = SparkSession.builder.appName("bjrecommender").enableHiveSupport().getOrCreate()
>>> sc = spark.sparkContext
>>> hiveCtx = HiveContext(sc)
>>> Rating = namedtuple("Rating", ["user", "product", "rating"])
>>> tid = namedtuple('tid',['id','cust_id'])
>>> now = datetime.datetime.now()
>>> begin_date = (now-datetime.timedelta(days=150)).strftime('%Y%m%d')
>>> begin_date = (now-datetime.timedelta(days=200)).strftime('%Y%m%d')
>>> end_date = now.strftime('%Y%m%d')
>>> sql="select dense_rank() over(order by cust_id) id,cust_id,item_id,need_score+sold_score score from (select cust_id,item_id,qty_need,qty_sold,ntile(5) over(partition by cust_id order by qty_need) need_score,ntile(5) over(partition by cust_id order by qty_sold) sold_score from (select cust_id,item_id,sum(qty_need) qty_need,sum(qty_sold) qty_sold from yxpt.pi_cust_item_day where date1>=" +begin_date + " and date1<="+end_date+ " group by cust_id,item_id) a1) b1"
>>> total = hiveCtx.sql(sql)
>>> id_custid = total.rdd.map(lambda x : tid(str(x[0]),x[1])).distinct()
17/08/30 09:18:03 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
>>> id_custid.toDF().registerTempTable("id_cid")
>>> ratings = total.rdd.map(lambda x : Rating(str(x[0]),int(x[2]),float(x[3])))
>>> ratings.toDF().show(3)
17/08/30 09:29:31 ERROR Utils: Uncaught exception in thread stdout writer for
|user| product|rating|
|   1|42010319|   2.0|
|   1|31010401|   2.0|
|   1|22240114|   2.0|
only showing top 3 rows

>>> model = ALS.train(ratings, rank=15, iterations=10,seed=0,lambda_=0.001)
17/08/30 09:34:59 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
17/08/30 09:34:59 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
17/08/30 09:34:59 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
17/08/30 09:34:59 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK
17/08/30 09:35:03 WARN Executor: 1 block locks were not released by TID = 4644:
17/08/30 09:35:03 WARN Executor: 1 block locks were not released by TID = 4645:
17/08/30 09:35:03 WARN Executor: 1 block locks were not released by TID = 4646:
17/08/30 09:35:03 WARN Executor: 1 block locks were not released by TID = 4647:
>>> all_rating=model.recommendProductsForUsers(30).map(lambda x:x[1]).collect()
>>> len(all_rating)
>>> len(all_rating[0])
>>> userProducts = ratings.map(lambda rating:(rating.user,rating.product))
>>> predictions = model.predictAll(userProducts).map(lambda rating:((rating.user,rating.product),rating.rating))
>>> type(predictions)

>>> predictions.toDF().show(3)
[Stage 258:=====================>                                   (3 + 5) / 8]17/08/30 09:44:26 WARN Executor: Managed memory leak detected; size = 15977666 bytes, TID = 4787
17/08/30 09:44:27 WARN Executor: Managed memory leak detected; size = 15977666 bytes, TID = 4788
|             _1|               _2|
only showing top 3 rows

>>> ratingsAndPredictions = ratings.map(lambda rating:((int(rating.user),rating.product),rating.rating)).join(predictions)
>>> ratingsAndPredictions.toDF().show(3)
|              _1|                  _2|
| [4075,53100103]|[4.0,3.1492042815...|
| [1335,34030227]|[4.0,3.9336990908...|
only showing top 3 rows

>>> MSE = ratingsAndPredictions.map(lambda ((x,y),(m,n)):math.pow(m-n,2)).reduce(lambda x,y:x+y)/ratingsAndPredictions.count()
>>> print "***************" +str(math.sqrt(MSE)) + "*****************"
>>> k=[]
>>> for row in all_rating:
...     k.extend(row)
>>> all_rating_rdd = sc.parallelize(k)
>>> all_rating_rdd.toDF().registerTempTable("all_score")
17/08/30 10:04:58 WARN TaskSetManager: Stage 415 contains a task of very large size (3118 KB). The maximum recommended task size is 100 KB.
>>> hiveCtx.sql("select * from all_score limit 5").show(3)
17/08/30 10:08:19 WARN TaskSetManager: Stage 416 contains a task of very large size (3118 KB). The maximum recommended task size is 100 KB.
| user| product|            rating|
|27456|51520615| 35.52432167919441|
|27456|34030316| 28.08260143903327|
only showing top 3 rows

>>> hiveCtx.sql("select a2.cust_id,a1.product,rating," + end_date +" date  "+ " from all_score a1,id_cid a2 " + "where a1.user=a2.id").show(5)
17/08/30 10:18:22 WARN TaskSetManager: Stage 417 contains a task of very large size (3118 KB). The maximum recommended task size is 100 KB.
|     cust_id| product|            rating|    date|
|110101100985|90190202| 25.33671962331747|20170830|
only showing top 5 rows

>>> hiveCtx.sql("SELECT NATION_CUST_CODE FROM yxpt.CO_CUST limit 3").show(3)
|    110114190495|
|    110115203224|
|    110108209145|

>>> hiveCtx.sql("SELECT * from id_cid limit 3").show(3)
+-----+------------+
|   id|     cust_id|
| 4549|110105106838|

>>> hiveCtx.sql("SELECT * from id_cid limit 5").show(5)
|   id|     cust_id|
| 4549|110105106838|

>>> hiveCtx.sql("select B.NATION_CUST_CODE CUST_CODE "+" from yxpt.CO_CUST B,id_cid A "+" where B.CUST_ID=A.CUST_ID limit 3").show(3)

>>> hiveCtx.sql("select B.NATION_CUST_CODE CUST_CODE "+" from yxpt.CO_CUST B,id_cid A "+" where B.CUST_ID=A.CUST_ID limit 3 ").show(3)
+------------+

>>> hiveCtx.sql("select C.PACK_BAR "+" from yxpt.PLM_ITEM C,all_score D "+" where C.ITEM_ID=D.product limit 3").show(3)
17/08/30 15:08:24 WARN TaskSetManager: Stage 448 contains a task of very large size (3118 KB). The maximum recommended task size is 100 KB.
|     PACK_BAR|

>>> hiveCtx.sql("select * from all_score limit 5").show(3)
17/08/30 16:09:04 WARN TaskSetManager: Stage 465 contains a task of very large size (3118 KB). The maximum recommended task size is 100 KB.
| user| product|            rating|
|27456|51520615| 35.52432167919441|
|27456|34030316| 28.08260143903327|
only showing top 3 rows

>>> hiveCtx.sql("select CO_CUST_T.NATION_CUST_CODE CUST_CODE "+" from yxpt.CO_CUST CO_CUST_T,id_cid id_cid_t "+" where CO_CUST_T.CUST_ID=id_cid_t.CUST_ID limit 3").show(3)

>>> hiveCtx.sql("select PLM_ITEM_T.PACK_BAR "+" from yxpt.PLM_ITEM PLM_ITEM_T,all_score all_score_t "+" where PLM_ITEM_T.ITEM_ID=all_score_t.product limit 3").show(3)
17/08/30 18:32:13 WARN TaskSetManager: Stage 481 contains a task of very large size (3118 KB). The maximum recommended task size is 100 KB.
|     PACK_BAR|




