Hortonworks 用于做 Sentiment Analysis 的 hiveddl.sql 文件

The hiveddl.sql script has performed the following steps to refine the data:

  • Converted the raw Twitter data into a tabular format.
  • Used the dictionary file to score the sentiment of each Tweet by the number of positive words compared to the number of negative words, and then assigned a positive, negative, or neutral sentiment value to each Tweet.
  • Created a new table that includes the sentiment value for each Tweet.

http://hortonworks.com/hadoop-tutorial/how-to-refine-and-visualize-sentiment-data/

ADD JAR json-serde-1.1.6-SNAPSHOT-jar-with-dependencies.jar;



-- External table over the tweet records as received from Twitter.
-- Rows are deserialized by the OpenX JSON SerDe from the files already
-- sitting under LOCATION; nothing is copied or rewritten.
CREATE EXTERNAL TABLE tweets_raw (
   id                      BIGINT,
   created_at              STRING,
   source                  STRING,
   favorited               BOOLEAN,
   retweet_count           INT,
   retweeted_status        STRUCT<
      text:STRING,
      user:STRUCT<screen_name:STRING,name:STRING>>,
   entities                STRUCT<
      urls:ARRAY<STRUCT<expanded_url:STRING>>,
      user_mentions:ARRAY<STRUCT<screen_name:STRING,name:STRING>>,
      hashtags:ARRAY<STRUCT<text:STRING>>>,
   text                    STRING,
   user                    STRUCT<
      screen_name:STRING,
      name:STRING,
      friends_count:INT,
      followers_count:INT,
      statuses_count:INT,
      verified:BOOLEAN,
      -- utc_offset was INT, but nulls arrive as strings, so it is read as STRING
      utc_offset:STRING,
      time_zone:STRING>,
   in_reply_to_screen_name STRING,
   year                    INT,
   month                   INT,
   day                     INT,
   hour                    INT
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
LOCATION '/user/hue/upload/upload/data/tweets_raw';



-- Sentiment dictionary: external table over tab-delimited text files.
-- The `word` and `polarity` columns are the ones the sentiment views use.
CREATE EXTERNAL TABLE dictionary (
    type     STRING,
    length   INT,
    word     STRING,
    pos      STRING,
    stemmed  STRING,
    polarity STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/user/hue/upload/upload/data/dictionary';



-- Lookup from a time zone name to a country: external table over
-- tab-delimited text files.
CREATE EXTERNAL TABLE time_zone_map (
    time_zone STRING,
    country   STRING,
    notes     STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/user/hue/upload/upload/data/time_zone_map';



-- Clean up tweets
-- tweets_simple: reduces each raw tweet to its id, a parsed timestamp (ts),
-- the tweet text and the author's time zone.
--
-- created_at is assumed to follow Twitter's standard layout, e.g.
-- 'Wed Aug 27 13:08:45 +0000 2008':
--   substring(created_at, 27, 4)  -> 'yyyy'
--   substring(created_at, 5, 15)  -> 'MMM dd HH:mm:ss'
-- The year is taken from the record itself rather than being fixed to 2013,
-- so tweets from any year get the correct timestamp.
-- The hour is parsed with 'HH' (0-23). The 12-hour pattern 'hh' mis-reads
-- 24-hour input: 12:xx is turned into 00:xx, and strict parsers reject 13-23.
-- NOTE(review): the '+0000' offset is not parsed; unix_timestamp interprets
-- the value in the session time zone -- confirm that is acceptable.
CREATE VIEW tweets_simple AS
SELECT
  id,
  cast(
    from_unixtime(
      unix_timestamp(
        concat(substring(created_at, 27, 4), ' ', substring(created_at, 5, 15)),
        'yyyy MMM dd HH:mm:ss'
      )
    ) as timestamp
  ) AS ts,
  text,
  user.time_zone
FROM tweets_raw
;



-- tweets_clean: tweets_simple plus the country looked up from the tweet's
-- time zone. The outer join keeps tweets whose time zone has no entry in
-- time_zone_map; their country is NULL.
CREATE VIEW tweets_clean AS
SELECT
  t.id,
  t.ts,
  t.text,
  m.country
FROM tweets_simple t
LEFT JOIN time_zone_map m
  ON m.time_zone = t.time_zone;



-- Compute sentiment
-- l1: one row per (tweet id, sentence). sentences() splits the lower-cased
-- text into an array of sentences, each an array of words; the lateral view
-- unnests the outer array so `words` is the word array of a single sentence.
CREATE VIEW l1 AS
SELECT
  id,
  words
FROM tweets_raw
LATERAL VIEW explode(sentences(lower(text))) sentence_tab AS words;

-- l2: one row per (tweet id, word), produced by unnesting each sentence's
-- word array from l1.
CREATE VIEW l2 AS
SELECT
  id,
  word
FROM l1
LATERAL VIEW explode(words) word_tab AS word;



-- was: select * from l2 left outer join dict d on l2.word = d.word where polarity = 'negative' limit 10;



-- l3: scores every word of every tweet against the sentiment dictionary:
-- -1 for a 'negative' entry, 1 for a 'positive' entry, 0 otherwise
-- (including words that have no dictionary match at all).
CREATE VIEW l3 AS
SELECT
  w.id,
  w.word,
  CASE
    WHEN d.polarity = 'negative' THEN -1
    WHEN d.polarity = 'positive' THEN 1
    ELSE 0
  END AS polarity
FROM l2 w
LEFT JOIN dictionary d
  ON w.word = d.word;

 

 -- tweets_sentiment: one row per tweet id, labelled by the sign of the summed
 -- word polarities from l3: 'positive' (> 0), 'negative' (< 0), else 'neutral'.
 CREATE TABLE tweets_sentiment
 STORED AS ORC
 AS
 SELECT
   scored.id,
   CASE
     WHEN scored.total_polarity > 0 THEN 'positive'
     WHEN scored.total_polarity < 0 THEN 'negative'
     ELSE 'neutral'
   END AS sentiment
 FROM (
   -- total the per-word scores once per tweet
   SELECT
     id,
     sum(polarity) AS total_polarity
   FROM l3
   GROUP BY id
 ) scored;



-- tweetsbi: the cleaned tweets joined back to their sentiment, with the
-- label re-numbered for reporting: positive = 2, neutral = 1, negative = 0.
-- A tweet with no row in tweets_sentiment is kept with a NULL sentiment.
CREATE TABLE tweetsbi
STORED AS ORC
AS
SELECT
  t.id,
  t.ts,
  t.text,
  t.country,
  CASE
    WHEN s.sentiment = 'positive' THEN 2
    WHEN s.sentiment = 'neutral'  THEN 1
    WHEN s.sentiment = 'negative' THEN 0
  END AS sentiment
FROM tweets_clean t
LEFT JOIN tweets_sentiment s
  ON t.id = s.id;



-- for Tableau or Excel

-- UDAF sentiscore = sum(sentiment)*50  / count(sentiment)



-- context n-gram made readable
-- twitter_3grams: for every (year, month, day, hour) bucket, the top 10
-- three-word sequences that follow the words "iron man 3" in the lower-cased
-- tweet text. context_ngrams() returns an array per bucket; the lateral view
-- unnests it so each row carries one result in `snippet`.
--
-- Fixes relative to the earlier version of this statement:
--   * storage format spelled RCFile (was the invalid "RCFilese");
--   * reads from tweets_raw, the table in this script that carries the
--     year/month/day/hour and text columns (was "tweets", which this
--     script never creates).
CREATE TABLE twitter_3grams
STORED AS RCFile
AS
SELECT
  year,
  month,
  day,
  hour,
  snippet
FROM (
  SELECT
    year,
    month,
    day,
    hour,
    -- the three nulls are the slots to fill after "iron man 3"; 10 = top-k
    context_ngrams(sentences(lower(text)), array("iron","man","3",null,null,null), 10) ngs
  FROM tweets_raw
  GROUP BY year, month, day, hour
) base
-- ngsTab is an arbitrary alias: required by the syntax even though unused
LATERAL VIEW explode(ngs) ngsTab AS snippet
;

你可能感兴趣的:(hive)