import numpy as np
import tensorflow as tf
# Load the IMDB review dataset, keeping only the 10,000 most frequent words.
imdb = tf.keras.datasets.imdb
train_split, test_split = imdb.load_data(num_words=10000)
train_x, train_y = train_split
test_x, test_y = test_split
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
17465344/17464789 [==============================] - 21s 1us/step
参数num_words = 10000的意思是仅保留训练数据中出现频率最高的前10000个单词,低频单词将被舍弃。这样得到的向量数据不会太大,便于处理。
数据集介绍:
IMDB数据集包含来自互联网的50000条严重两极分化的评论,该数据被分为用于训练的25000条评论和用于测试的25000条评论,训练集和测试集都包含50%的正面评价和50%的负面评价。该数据集已经经过预处理:评论(单词序列)已经被转换为整数序列,其中每个整数代表字典中的某个单词。
# Inspect the raw splits: the shape of each split, then the first training
# review, which is a list of integer word ids.
train_x.shape
(25000,)
test_x.shape
(25000,)
print(train_x[0])
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
例如一句话:str_word = “stay strong Wuhan!stay strong China!”
1.去除空格,统计不重复词到列表:word_list = [“stay”,“strong”,“Wuhan”,"!",“China”],一共5个词
2.word_list列表中每个词用对应的下标索引表示:word_dic = {“stay”:0,“strong”:1,“Wuhan”:2,"!":3,“China”:4}
3.将原来句子中每个词用对应的下标索引(id)表示,句子转为整数序列:int_word = [0,1,2,3,0,1,4,3]
将词用向量进行表示,向量的维度为词向量维度,假设词向量维度为3。通过在词向量表中查表,找出词下标所映射的词向量。例如“!”下标为3,则找出的词向量为[10,12,19]
tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=word_dimension, input_length=maxlen)
输入为句子中各单词的整数表示,输出为每个词都被映射成词向量后的句子
Input shape:(batch_size, input_length)
Output shape:(batch_size, input_length, output_dim)
参数说明:
input_dim:词的总个数
output_dim:词嵌入的维度
input_length:当输入序列的长度固定时,该值为其长度。如果要在该层后接Flatten层,然后接Dense层,则必须指定该参数,否则Dense层的输出维度无法自动推断。
# Toy example: a batch of two sequences, each holding three token ids
# taken from a 3-word vocabulary (ids 0..2).
data = np.array([[0, 1, 2], [2, 1, 1]])
# input_length describes the fixed sequence length, so it has to agree with
# the second dimension of `data` (3); the previous value of 5 did not.
emb = tf.keras.layers.Embedding(input_dim=3, output_dim=3, input_length=3)
# Look up a 3-dimensional vector for every id; result shape is (2, 3, 3).
emb(data)
# The layer's variables (the embedding table).
emb.variables
[]
查看数据集中词和id的映射关系(词表)
# Fetch the dataset's vocabulary and display it: a dict that maps each word
# (str) to an integer index.
word_index = imdb.get_word_index()
word_index
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
1646592/1641221 [==============================] - 1s 0us/step
{'fawn': 34701,
'tsukino': 52006,
'nunnery': 52007,
'sonja': 16816,
'vani': 63951,
'woods': 1408,
'spiders': 16115,
'hanging': 2345,
'woody': 2289,
'trawling': 52008,
"hold's": 52009,
'comically': 11307,
'localized': 40830,
'disobeying': 30568,
"'royale": 52010,
"harpo's": 40831,
'canet': 52011,
'aileen': 19313,
'acurately': 52012,
"diplomat's": 52013,
'rickman': 25242,
'arranged': 6746,
'rumbustious': 52014,
'familiarness': 52015,
"spider'": 52016,
'hahahah': 68804,
"wood'": 52017,
'transvestism': 40833,
"hangin'": 34702,
'bringing': 2338,
'seamier': 40834,
'wooded': 34703,
'bravora': 52018,
'grueling': 16817,
'wooden': 1636,
'wednesday': 16818,
"'prix": 52019,
'altagracia': 34704,
'circuitry': 52020,
'crotch': 11585,
'busybody': 57766,
"tart'n'tangy": 52021,
'burgade': 14129,
'thrace': 52023,
"tom's": 11038,
'snuggles': 52025,
'francesco': 29114,
'complainers': 52027,
'templarios': 52125,
'272': 40835,
'273': 52028,
'zaniacs': 52130,
'275': 34706,
'consenting': 27631,
'snuggled': 40836,
'inanimate': 15492,
'uality': 52030,
'bronte': 11926,
'errors': 4010,
'dialogs': 3230,
"yomada's": 52031,
"madman's": 34707,
'dialoge': 30585,
'usenet': 52033,
'videodrome': 40837,
"kid'": 26338,
'pawed': 52034,
"'girlfriend'": 30569,
"'pleasure": 52035,
"'reloaded'": 52036,
"kazakos'": 40839,
'rocque': 52037,
'mailings': 52038,
'brainwashed': 11927,
'mcanally': 16819,
"tom''": 52039,
'kurupt': 25243,
'affiliated': 21905,
'babaganoosh': 52040,
"noe's": 40840,
'quart': 40841,
'kids': 359,
'uplifting': 5034,
'controversy': 7093,
'kida': 21906,
'kidd': 23379,
"error'": 52041,
'neurologist': 52042,
'spotty': 18510,
'cobblers': 30570,
'projection': 9878,
'fastforwarding': 40842,
'sters': 52043,
"eggar's": 52044,
'etherything': 52045,
'gateshead': 40843,
'airball': 34708,
'unsinkable': 25244,
'stern': 7180,
"cervi's": 52046,
'dnd': 40844,
'dna': 11586,
'insecurity': 20598,
"'reboot'": 52047,
'trelkovsky': 11037,
'jaekel': 52048,
'sidebars': 52049,
"sforza's": 52050,
'distortions': 17633,
'mutinies': 52051,
'sermons': 30602,
'7ft': 40846,
'boobage': 52052,
"o'bannon's": 52053,
'populations': 23380,
'chulak': 52054,
'mesmerize': 27633,
'quinnell': 52055,
'yahoo': 10307,
'meteorologist': 52057,
'beswick': 42577,
'boorman': 15493,
'voicework': 40847,
"ster'": 52058,
'blustering': 22922,
'hj': 52059,
'intake': 27634,
'morally': 5621,
'jumbling': 40849,
'bowersock': 52060,
"'porky's'": 52061,
'gershon': 16821,
'ludicrosity': 40850,
'coprophilia': 52062,
'expressively': 40851,
"india's": 19500,
"post's": 34710,
'wana': 52063,
'wang': 5283,
'wand': 30571,
'wane': 25245,
'edgeways': 52321,
'titanium': 34711,
'pinta': 40852,
'want': 178,
'pinto': 30572,
'whoopdedoodles': 52065,
'tchaikovsky': 21908,
'travel': 2103,
"'victory'": 52066,
'copious': 11928,
'gouge': 22433,
"chapters'": 52067,
'barbra': 6702,
'uselessness': 30573,
"wan'": 52068,
'assimilated': 27635,
'petiot': 16116,
'most\x85and': 52069,
'dinosaurs': 3930,
'wrong': 352,
'seda': 52070,
'stollen': 52071,
'sentencing': 34712,
'ouroboros': 40853,
'assimilates': 40854,
'colorfully': 40855,
'glenne': 27636,
'dongen': 52072,
'subplots': 4760,
'kiloton': 52073,
'chandon': 23381,
"effect'": 34713,
'snugly': 27637,
'kuei': 40856,
'welcomed': 9092,
'dishonor': 30071,
'concurrence': 52075,
'stoicism': 23382,
"guys'": 14896,
"beroemd'": 52077,
'butcher': 6703,
"melfi's": 40857,
'aargh': 30623,
'playhouse': 20599,
'wickedly': 11308,
'fit': 1180,
'labratory': 52078,
'lifeline': 40859,
'screaming': 1927,
'fix': 4287,
'cineliterate': 52079,
'fic': 52080,
'fia': 52081,
'fig': 34714,
'fmvs': 52082,
'fie': 52083,
'reentered': 52084,
'fin': 30574,
'doctresses': 52085,
'fil': 52086,
'zucker': 12606,
'ached': 31931,
'counsil': 52088,
'paterfamilias': 52089,
'songwriter': 13885,
'shivam': 34715,
'hurting': 9654,
'effects': 299,
'slauther': 52090,
"'flame'": 52091,
'sommerset': 52092,
'interwhined': 52093,
'whacking': 27638,
'bartok': 52094,
'barton': 8775,
'frewer': 21909,
"fi'": 52095,
'ingrid': 6192,
'stribor': 30575,
'approporiately': 52096,
'wobblyhand': 52097,
'tantalisingly': 52098,
'ankylosaurus': 52099,
'parasites': 17634,
'childen': 52100,
"jenkins'": 52101,
'metafiction': 52102,
'golem': 17635,
'indiscretion': 40860,
"reeves'": 23383,
"inamorata's": 57781,
'brittannica': 52104,
'adapt': 7916,
"russo's": 30576,
'guitarists': 48246,
'abbott': 10553,
'abbots': 40861,
'lanisha': 17649,
'magickal': 40863,
'mattter': 52105,
"'willy": 52106,
'pumpkins': 34716,
'stuntpeople': 52107,
'estimate': 30577,
'ugghhh': 40864,
'gameplay': 11309,
"wern't": 52108,
"n'sync": 40865,
'sickeningly': 16117,
'chiara': 40866,
'disturbed': 4011,
'portmanteau': 40867,
'ineffectively': 52109,
"duchonvey's": 82143,
"nasty'": 37519,
'purpose': 1285,
'lazers': 52112,
'lightened': 28105,
'kaliganj': 52113,
'popularism': 52114,
"damme's": 18511,
'stylistics': 30578,
'mindgaming': 52115,
'spoilerish': 46449,
"'corny'": 52117,
'boerner': 34718,
'olds': 6792,
'bakelite': 52118,
'renovated': 27639,
'forrester': 27640,
"lumiere's": 52119,
'gaskets': 52024,
'needed': 884,
'smight': 34719,
'master': 1297,
"edie's": 25905,
'seeber': 40868,
'hiya': 52120,
'fuzziness': 52121,
'genesis': 14897,
'rewards': 12607,
'enthrall': 30579,
"'about": 40869,
"recollection's": 52122,
'mutilated': 11039,
'fatherlands': 52123,
"fischer's": 52124,
'positively': 5399,
'270': 34705,
'ahmed': 34720,
'zatoichi': 9836,
'bannister': 13886,
'anniversaries': 52127,
"helm's": 30580,
"'work'": 52128,
'exclaimed': 34721,
"'unfunny'": 52129,
'274': 52029,
'feeling': 544,
"wanda's": 52131,
'dolan': 33266,
'278': 52133,
'peacoat': 52134,
'brawny': 40870,
'mishra': 40871,
'worlders': 40872,
'protags': 52135,
'skullcap': 52136,
'dastagir': 57596,
'affairs': 5622,
'wholesome': 7799,
'hymen': 52137,
'paramedics': 25246,
'unpersons': 52138,
'heavyarms': 52139,
'affaire': 52140,
'coulisses': 52141,
'hymer': 40873,
'kremlin': 52142,
'shipments': 30581,
'pixilated': 52143,
"'00s": 30582,
'diminishing': 18512,
'cinematic': 1357,
'resonates': 14898,
'simplify': 40874,
"nature'": 40875,
'temptresses': 40876,
'reverence': 16822,
'resonated': 19502,
'dailey': 34722,
'2\x85': 52144,
'treize': 27641,
'majo': 52145,
'kiya': 21910,
'woolnough': 52146,
'thanatos': 39797,
'sandoval': 35731,
'dorama': 40879,
"o'shaughnessy": 52147,
'tech': 4988,
'fugitives': 32018,
'teck': 30583,
"'e'": 76125,
'doesn’t': 40881,
'purged': 52149,
'saying': 657,
"martians'": 41095,
'norliss': 23418,
'dickey': 27642,
'dicker': 52152,
"'sependipity": 52153,
'padded': 8422,
'ordell': 57792,
"sturges'": 40882,
'independentcritics': 52154,
'tempted': 5745,
"atkinson's": 34724,
'hounded': 25247,
'apace': 52155,
'clicked': 15494,
"'humor'": 30584,
"martino's": 17177,
"'supporting": 52156,
'warmongering': 52032,
"zemeckis's": 34725,
'lube': 21911,
'shocky': 52157,
'plate': 7476,
'plata': 40883,
'sturgess': 40884,
"nerds'": 40885,
'plato': 20600,
'plath': 34726,
'platt': 40886,
'mcnab': 52159,
'clumsiness': 27643,
'altogether': 3899,
'massacring': 42584,
'bicenntinial': 52160,
'skaal': 40887,
'droning': 14360,
'lds': 8776,
'jaguar': 21912,
"cale's": 34727,
'nicely': 1777,
'mummy': 4588,
"lot's": 18513,
'patch': 10086,
'kerkhof': 50202,
"leader's": 52161,
"'movie": 27644,
'uncomfirmed': 52162,
'heirloom': 40888,
'wrangle': 47360,
'emotion\x85': 52163,
"'stargate'": 52164,
'pinoy': 40889,
'conchatta': 40890,
'broeke': 41128,
'advisedly': 40891,
"barker's": 17636,
'descours': 52166,
'lots': 772,
'lotr': 9259,
'irs': 9879,
'lott': 52167,
'xvi': 40892,
'irk': 34728,
'irl': 52168,
'ira': 6887,
'belzer': 21913,
'irc': 52169,
'ire': 27645,
'requisites': 40893,
'discipline': 7693,
'lyoko': 52961,
'extend': 11310,
'nature': 873,
"'dickie'": 52170,
'optimist': 40894,
'lapping': 30586,
'superficial': 3900,
'vestment': 52171,
'extent': 2823,
'tendons': 52172,
"heller's": 52173,
'quagmires': 52174,
'miyako': 52175,
'moocow': 20601,
"coles'": 52176,
'lookit': 40895,
'ravenously': 52177,
'levitating': 40896,
'perfunctorily': 52178,
'lookin': 30587,
"lot'": 40898,
'lookie': 52179,
'fearlessly': 34870,
'libyan': 52181,
'fondles': 40899,
'gopher': 35714,
'wearying': 40901,
"nz's": 52182,
'minuses': 27646,
'puposelessly': 52183,
'shandling': 52184,
'decapitates': 31268,
'humming': 11929,
"'nother": 40902,
'smackdown': 21914,
'underdone': 30588,
'frf': 40903,
'triviality': 52185,
'fro': 25248,
'bothers': 8777,
"'kensington": 52186,
'much': 73,
'muco': 34730,
'wiseguy': 22615,
"richie's": 27648,
'tonino': 40904,
'unleavened': 52187,
'fry': 11587,
"'tv'": 40905,
'toning': 40906,
'obese': 14361,
'sensationalized': 30589,
'spiv': 40907,
'spit': 6259,
'arkin': 7364,
'charleton': 21915,
'jeon': 16823,
'boardroom': 21916,
'doubts': 4989,
'spin': 3084,
'hepo': 53083,
'wildcat': 27649,
'venoms': 10584,
'misconstrues': 52191,
'mesmerising': 18514,
'misconstrued': 40908,
'rescinds': 52192,
'prostrate': 52193,
'majid': 40909,
'climbed': 16479,
'canoeing': 34731,
'majin': 52195,
'animie': 57804,
'sylke': 40910,
'conditioned': 14899,
'waddell': 40911,
'3\x85': 52196,
'hyperdrive': 41188,
'conditioner': 34732,
'bricklayer': 53153,
'hong': 2576,
'memoriam': 52198,
'inventively': 30592,
"levant's": 25249,
'portobello': 20638,
'remand': 52200,
'mummified': 19504,
'honk': 27650,
'spews': 19505,
'visitations': 40912,
'mummifies': 52201,
'cavanaugh': 25250,
'zeon': 23385,
"jungle's": 40913,
'viertel': 34733,
'frenchmen': 27651,
'torpedoes': 52202,
'schlessinger': 52203,
'torpedoed': 34734,
'blister': 69876,
'cinefest': 52204,
'furlough': 34735,
'mainsequence': 52205,
'mentors': 40914,
'academic': 9094,
'stillness': 20602,
'academia': 40915,
'lonelier': 52206,
'nibby': 52207,
"losers'": 52208,
'cineastes': 40916,
'corporate': 4449,
'massaging': 40917,
'bellow': 30593,
'absurdities': 19506,
'expetations': 53241,
'nyfiken': 40918,
'mehras': 75638,
'lasse': 52209,
'visability': 52210,
'militarily': 33946,
"elder'": 52211,
'gainsbourg': 19023,
'hah': 20603,
'hai': 13420,
'haj': 34736,
'hak': 25251,
'hal': 4311,
'ham': 4892,
'duffer': 53259,
'haa': 52213,
'had': 66,
'advancement': 11930,
'hag': 16825,
"hand'": 25252,
'hay': 13421,
'mcnamara': 20604,
"mozart's": 52214,
'duffel': 30731,
'haq': 30594,
'har': 13887,
'has': 44,
'hat': 2401,
'hav': 40919,
'haw': 30595,
'figtings': 52215,
'elders': 15495,
'underpanted': 52216,
'pninson': 52217,
'unequivocally': 27652,
"barbara's": 23673,
"bello'": 52219,
'indicative': 12997,
'yawnfest': 40920,
'hexploitation': 52220,
"loder's": 52221,
'sleuthing': 27653,
"justin's": 32622,
"'ball": 52222,
"'summer": 52223,
"'demons'": 34935,
"mormon's": 52225,
"laughton's": 34737,
'debell': 52226,
'shipyard': 39724,
'unabashedly': 30597,
'disks': 40401,
'crowd': 2290,
'crowe': 10087,
"vancouver's": 56434,
'mosques': 34738,
'crown': 6627,
'culpas': 52227,
'crows': 27654,
'surrell': 53344,
'flowless': 52229,
'sheirk': 52230,
"'three": 40923,
"peterson'": 52231,
'ooverall': 52232,
'perchance': 40924,
'bottom': 1321,
'chabert': 53363,
'sneha': 52233,
'inhuman': 13888,
'ichii': 52234,
'ursla': 52235,
'completly': 30598,
'moviedom': 40925,
'raddick': 52236,
'brundage': 51995,
'brigades': 40926,
'starring': 1181,
"'goal'": 52237,
'caskets': 52238,
'willcock': 52239,
"threesome's": 52240,
"mosque'": 52241,
"cover's": 52242,
'spaceships': 17637,
'anomalous': 40927,
'ptsd': 27655,
'shirdan': 52243,
'obscenity': 21962,
'lemmings': 30599,
'duccio': 30600,
"levene's": 52244,
"'gorby'": 52245,
"teenager's": 25255,
'marshall': 5340,
'honeymoon': 9095,
'shoots': 3231,
'despised': 12258,
'okabasho': 52246,
'fabric': 8289,
'cannavale': 18515,
'raped': 3537,
"tutt's": 52247,
'grasping': 17638,
'despises': 18516,
"thief's": 40928,
'rapes': 8926,
'raper': 52248,
"eyre'": 27656,
'walchek': 52249,
"elmo's": 23386,
'perfumes': 40929,
'spurting': 21918,
"exposition'\x85": 52250,
'denoting': 52251,
'thesaurus': 34740,
"shoot'": 40930,
'bonejack': 49759,
'simpsonian': 52253,
'hebetude': 30601,
"hallow's": 34741,
'desperation\x85': 52254,
'incinerator': 34742,
'congratulations': 10308,
'humbled': 52255,
"else's": 5924,
'trelkovski': 40845,
"rape'": 52256,
"'chapters'": 59386,
'1600s': 52257,
'martian': 7253,
'nicest': 25256,
'eyred': 52259,
'passenger': 9457,
'disgrace': 6041,
'moderne': 52260,
'barrymore': 5120,
'yankovich': 52261,
'moderns': 40931,
'studliest': 52262,
'bedsheet': 52263,
'decapitation': 14900,
'slurring': 52264,
"'nunsploitation'": 52265,
"'character'": 34743,
'cambodia': 9880,
'rebelious': 52266,
'pasadena': 27657,
'crowne': 40932,
"'bedchamber": 52267,
'conjectural': 52268,
'appologize': 52269,
'halfassing': 52270,
'paycheque': 57816,
'palms': 20606,
"'islands": 52271,
'hawked': 40933,
'palme': 21919,
'conservatively': 40934,
'larp': 64007,
'palma': 5558,
'smelling': 21920,
'aragorn': 12998,
'hawker': 52272,
'hawkes': 52273,
'explosions': 3975,
'loren': 8059,
"pyle's": 52274,
'shootout': 6704,
"mike's": 18517,
"driscoll's": 52275,
'cogsworth': 40935,
"britian's": 52276,
'childs': 34744,
"portrait's": 52277,
'chain': 3626,
'whoever': 2497,
'puttered': 52278,
'childe': 52279,
'maywether': 52280,
'chair': 3036,
"rance's": 52281,
'machu': 34745,
'ballet': 4517,
'grapples': 34746,
'summerize': 76152,
'freelance': 30603,
"andrea's": 52283,
'\x91very': 52284,
'coolidge': 45879,
'mache': 18518,
'balled': 52285,
'grappled': 40937,
'macha': 18519,
'underlining': 21921,
'macho': 5623,
'oversight': 19507,
'machi': 25257,
'verbally': 11311,
'tenacious': 21922,
'windshields': 40938,
'paychecks': 18557,
'jerk': 3396,
"good'": 11931,
'prancer': 34748,
'prances': 21923,
'olympus': 52286,
'lark': 21924,
'embark': 10785,
'gloomy': 7365,
'jehaan': 52287,
'turaqui': 52288,
"child'": 20607,
'locked': 2894,
'pranced': 52289,
'exact': 2588,
'unattuned': 52290,
'minute': 783,
'skewed': 16118,
'hodgins': 40940,
'skewer': 34749,
'think\x85': 52291,
'rosenstein': 38765,
'helmit': 52292,
'wrestlemanias': 34750,
'hindered': 16826,
"martha's": 30604,
'cheree': 52293,
"pluckin'": 52294,
'ogles': 40941,
'heavyweight': 11932,
'aada': 82190,
'chopping': 11312,
'strongboy': 61534,
'hegemonic': 41342,
'adorns': 40942,
'xxth': 41346,
'nobuhiro': 34751,
'capitães': 52298,
'kavogianni': 52299,
'antwerp': 13422,
'celebrated': 6538,
'roarke': 52300,
'baggins': 40943,
'cheeseburgers': 31270,
'matras': 52301,
"nineties'": 52302,
"'craig'": 52303,
'celebrates': 12999,
'unintentionally': 3383,
'drafted': 14362,
'climby': 52304,
'303': 52305,
'oldies': 18520,
'climbs': 9096,
'honour': 9655,
'plucking': 34752,
'305': 30074,
'address': 5514,
'menjou': 40944,
"'freak'": 42592,
'dwindling': 19508,
'benson': 9458,
'white’s': 52307,
'shamelessness': 40945,
'impacted': 21925,
'upatz': 52308,
'cusack': 3840,
"flavia's": 37567,
'effette': 52309,
'influx': 34753,
'boooooooo': 52310,
'dimitrova': 52311,
'houseman': 13423,
'bigas': 25259,
'boylen': 52312,
'phillipenes': 52313,
'fakery': 40946,
"grandpa's": 27658,
'darnell': 27659,
'undergone': 19509,
'handbags': 52315,
'perished': 21926,
'pooped': 37778,
'vigour': 27660,
'opposed': 3627,
'etude': 52316,
"caine's": 11799,
'doozers': 52317,
'photojournals': 34754,
'perishes': 52318,
'constrains': 34755,
'migenes': 40948,
'consoled': 30605,
'alastair': 16827,
'wvs': 52319,
'ooooooh': 52320,
'approving': 34756,
'consoles': 40949,
'disparagement': 52064,
'futureistic': 52322,
'rebounding': 52323,
"'date": 52324,
'gregoire': 52325,
'rutherford': 21927,
'americanised': 34757,
'novikov': 82196,
'following': 1042,
'munroe': 34758,
"morita'": 52326,
'christenssen': 52327,
'oatmeal': 23106,
'fossey': 25260,
'livered': 40950,
'listens': 13000,
"'marci": 76164,
"otis's": 52330,
'thanking': 23387,
'maude': 16019,
'extensions': 34759,
'ameteurish': 52332,
"commender's": 52333,
'agricultural': 27661,
'convincingly': 4518,
'fueled': 17639,
'mahattan': 54014,
"paris's": 40952,
'vulkan': 52336,
'stapes': 52337,
'odysessy': 52338,
'harmon': 12259,
'surfing': 4252,
'halloran': 23494,
'unbelieveably': 49580,
"'offed'": 52339,
'quadrant': 30607,
'inhabiting': 19510,
'nebbish': 34760,
'forebears': 40953,
'skirmish': 34761,
'ocassionally': 52340,
"'resist": 52341,
'impactful': 21928,
'spicier': 52342,
'touristy': 40954,
"'football'": 52343,
'webpage': 40955,
'exurbia': 52345,
'jucier': 52346,
'professors': 14901,
'structuring': 34762,
'jig': 30608,
'overlord': 40956,
'disconnect': 25261,
'sniffle': 82201,
'slimeball': 40957,
'jia': 40958,
'milked': 16828,
'banjoes': 40959,
'jim': 1237,
'workforces': 52348,
'jip': 52349,
'rotweiller': 52350,
'mundaneness': 34763,
"'ninja'": 52351,
"dead'": 11040,
"cipriani's": 40960,
'modestly': 20608,
"professor'": 52352,
'shacked': 40961,
'bashful': 34764,
'sorter': 23388,
'overpowering': 16120,
'workmanlike': 18521,
'henpecked': 27662,
'sorted': 18522,
"jōb's": 52354,
"'always": 52355,
"'baptists": 34765,
'dreamcatchers': 52356,
"'silence'": 52357,
'hickory': 21929,
'fun\x97yet': 52358,
'breakumentary': 52359,
'didn': 15496,
'didi': 52360,
'pealing': 52361,
'dispite': 40962,
"italy's": 25262,
'instability': 21930,
'quarter': 6539,
'quartet': 12608,
'padmé': 52362,
"'bleedmedry": 52363,
'pahalniuk': 52364,
'honduras': 52365,
'bursting': 10786,
"pablo's": 41465,
'irremediably': 52367,
'presages': 40963,
'bowlegged': 57832,
'dalip': 65183,
'entering': 6260,
'newsradio': 76172,
'presaged': 54150,
"giallo's": 27663,
'bouyant': 40964,
'amerterish': 52368,
'rajni': 18523,
'leeves': 30610,
'macauley': 34767,
'seriously': 612,
'sugercoma': 52369,
'grimstead': 52370,
"'fairy'": 52371,
'zenda': 30611,
"'twins'": 52372,
'realisation': 17640,
'highsmith': 27664,
'raunchy': 7817,
'incentives': 40965,
'flatson': 52374,
'snooker': 35097,
'crazies': 16829,
'crazier': 14902,
'grandma': 7094,
'napunsaktha': 52375,
'workmanship': 30612,
'reisner': 52376,
"sanford's": 61306,
'\x91doña': 52377,
'modest': 6108,
"everything's": 19153,
'hamer': 40966,
"couldn't'": 52379,
'quibble': 13001,
'socking': 52380,
'tingler': 21931,
'gutman': 52381,
'lachlan': 40967,
'tableaus': 52382,
'headbanger': 52383,
'spoken': 2847,
'cerebrally': 34768,
"'road": 23490,
'tableaux': 21932,
"proust's": 40968,
'periodical': 40969,
"shoveller's": 52385,
'tamara': 25263,
'affords': 17641,
'concert': 3249,
"yara's": 87955,
'someome': 52386,
'lingering': 8424,
"abraham's": 41511,
'beesley': 34769,
'cherbourg': 34770,
'kagan': 28624,
'snatch': 9097,
"miyazaki's": 9260,
'absorbs': 25264,
"koltai's": 40970,
'tingled': 64027,
'crossroads': 19511,
'rehab': 16121,
'falworth': 52389,
'sequals': 52390,
...}
词表增加特殊字符标识:
# Shift every word index up by one so that ids 0 and 1 are free for the two
# special tokens below.
word2id = {word: index + 1 for word, index in word_index.items()}
# Padding token. The padding step looks this entry up as word2id[''], and it
# must be 0. Previously both special tokens used the key '', so the second
# assignment overwrote this one and the padding id silently became 1.
word2id[''] = 0
# NOTE(review): the original name of the second special token was lost (its
# key was also an empty string); '<UNK>' is a reconstruction -- confirm.
word2id['<UNK>'] = 1
构造id和word的映射表
# Reverse lookup table: id -> word, built by inverting word2id.
id2word = dict(zip(word2id.values(), word2id.keys()))
def get_words(sent_ids, id_to_word=None, index_from=3):
    """Decode one review, given as raw ids from ``imdb.load_data``, into text.

    ``imdb.load_data`` does not store word-index values directly: with its
    default arguments, ids below ``index_from`` (3) are reserved for padding,
    start-of-sequence and out-of-vocabulary markers, and a word whose
    word-index value is ``r`` is stored as ``r + index_from``.  The lookup
    table is keyed by ``r + 1``, hence the ``index_from - 1`` correction
    below.  The earlier ``i + 1`` lookup ignored this offset and therefore
    returned unrelated words.

    Args:
        sent_ids: iterable of integer token ids for a single review.
        id_to_word: mapping from (word-index value + 1) to word.  Defaults
            to the module-level ``id2word`` table.
        index_from: offset that ``imdb.load_data`` applied to real words
            (its default is 3).

    Returns:
        The decoded words joined by single spaces.  Reserved ids and ids
        missing from the table contribute an empty string.
    """
    table = id2word if id_to_word is None else id_to_word
    key_shift = index_from - 1

    def decode(token_id):
        # Ids below index_from are markers, not words.
        if token_id < index_from:
            return ""
        return table.get(token_id - key_shift, "")

    return ' '.join(decode(i) for i in sent_ids)
# Decode the first training review back into words and print it.
sent = get_words(train_x[0])
print(sent)
the as you with out themselves powerful lets loves their becomes reaching had journalist of lot from anyone to have after out atmosphere never more room and it so heart shows to years of every never going and help moments or of every chest visual movie except her was several of enough more with is now current film as you of mine potentially unfortunately of you than him that with out themselves her get for was camp of you movie sometimes movie that with scary but and to story wonderful that in seeing in character to of 70s musicians with heart had shadows they of here that with her serious to have does when from why what have critics they is you that isn't one will very to as itself with other and in of seen over landed for anyone of and br show's to whether from than out themselves history he name half some br of and odd was two most of mean for 1 any an boat she he should is thought frog but of script you not while history he heart to real at barrel but when from one bit then have two of script their with her nobody most that with wasn't to with armed acting watch an for with heartfelt film want an
对训练集和测试集的整数序列进行处理:将原始数据中每个词的id加1
# Shift every token id up by one in both splits.
# The reviews differ in length, so the nested list is ragged: dtype=object
# keeps the result a 1-D array of Python lists.  Without it, NumPy >= 1.24
# raises ValueError for ragged input (older releases only warned).
train_text = np.array([[i + 1 for i in text] for text in train_x], dtype=object)
test_text = np.array([[i + 1 for i in text] for text in test_x], dtype=object)
# Check that both shifted splits still hold one entry per review.
train_text.shape
(25000,)
test_text.shape
(25000,)
tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre', truncating='pre', value=0.)
参数
sequences:浮点数或整数构成的两层嵌套列表
maxlen:None或整数,为序列的最大长度。大于此长度的序列将被截短,小于此长度的序列将被补0(默认在序列起始处补,可通过padding参数改为在结尾补)。
dtype:返回的numpy array的数据类型
padding:‘pre’或‘post’,确定当需要补0时,在序列的起始(pre)还是结尾(post)补,默认pre
truncating:‘pre’或‘post’,确定当需要截断序列时,从起始(pre)还是结尾(post)截断,默认pre
value:浮点数,此值将在填充时代替默认的填充值0
# Small demonstration of pad_sequences: bring both lists to length 4,
# padding and truncating at the end ('post') and filling with 0.
a = [[1, 2, 3], [4, 5, 6, 7]]
bs_packed = tf.keras.preprocessing.sequence.pad_sequences(
    a,
    maxlen=4,
    padding='post',
    truncating='post',
    value=0,
)
print(bs_packed)
[[1 2 3 0]
[4 5 6 7]]
# Pad (or truncate) every review at the end so that all sequences have
# length 256; the fill value is the id stored under word2id[''].
pad_options = dict(
    value=word2id[''],
    padding='post',
    truncating='post',
    maxlen=256,
)
train_data = tf.keras.preprocessing.sequence.pad_sequences(train_text, **pad_options)
test_data = tf.keras.preprocessing.sequence.pad_sequences(test_text, **pad_options)
# Display the first padded review; padding is appended after the real tokens.
train_data[0]
array([ 2, 15, 23, 17, 44, 531, 974, 1623, 1386, 66, 459,
4469, 67, 3942, 5, 174, 37, 257, 6, 26, 101, 44,
839, 113, 51, 671, 3, 10, 36, 481, 285, 6, 151,
5, 173, 113, 168, 3, 337, 386, 40, 5, 173, 4537,
1112, 18, 547, 39, 14, 448, 5, 193, 51, 17, 7,
148, 2026, 20, 15, 23, 5, 1921, 4614, 470, 5, 23,
72, 88, 13, 17, 44, 531, 39, 77, 16, 14, 1248,
5, 23, 18, 516, 18, 13, 17, 627, 19, 3, 6,
63, 387, 13, 9, 317, 9, 107, 6, 5, 2224, 5245,
17, 481, 67, 3786, 34, 5, 131, 13, 17, 39, 620,
6, 26, 125, 52, 37, 136, 49, 26, 1416, 34, 7,
23, 13, 216, 29, 78, 53, 6, 15, 408, 17, 83,
3, 9, 5, 108, 118, 5953, 16, 257, 5, 3, 8,
3767, 6, 724, 37, 72, 44, 531, 477, 27, 401, 318,
47, 8, 5, 3, 1030, 14, 105, 89, 5, 382, 16,
298, 99, 33, 2072, 57, 27, 142, 7, 195, 7487, 19,
5, 227, 23, 22, 135, 477, 27, 481, 6, 145, 31,
5536, 19, 52, 37, 29, 225, 93, 26, 105, 5, 227,
66, 17, 39, 1335, 89, 13, 17, 284, 6, 17, 4473,
114, 104, 33, 16, 17, 5346, 20, 179, 33, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0])
# Vocabulary size: the number of entries in word2id (used as the Embedding
# layers' input_dim).
vocab_size = len(word2id)
# Maximum sequence length (same value as the maxlen used when padding).
maxlen = 256
def rnn_model():
    """Build and compile a SimpleRNN binary classifier.

    Layers: an Embedding (vocab_size ids -> 200-d vectors, sequences of
    length maxlen), a 64-unit SimpleRNN that returns only its last output,
    and a single sigmoid unit.  Compiled with Adam, binary cross-entropy
    and the accuracy metric.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    layers = tf.keras.layers
    net = tf.keras.Sequential()
    net.add(layers.Embedding(input_dim=vocab_size, output_dim=200, input_length=maxlen))
    net.add(layers.SimpleRNN(64, return_sequences=False))
    net.add(layers.Dense(1, activation='sigmoid'))
    net.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=['accuracy'],
    )
    return net
# Build the model.
# NOTE(review): this rebinds the name rnn_model from the builder function to
# the model it returns, so the builder cannot be called again afterwards.
rnn_model = rnn_model()
%%time
# Train for 5 epochs with batches of 64, holding out 20% of the training
# data for validation.
history1 = rnn_model.fit(
    train_data,
    train_y,
    batch_size=64,
    epochs=5,
    validation_split=0.2,
)
Train on 20000 samples, validate on 5000 samples
Epoch 1/5
20000/20000 [==============================] - 81s 4ms/sample - loss: 0.6956 - accuracy: 0.4990 - val_loss: 0.6935 - val_accuracy: 0.4992
Epoch 2/5
20000/20000 [==============================] - 81s 4ms/sample - loss: 0.6582 - accuracy: 0.5728 - val_loss: 0.7296 - val_accuracy: 0.5044
Epoch 3/5
20000/20000 [==============================] - 80s 4ms/sample - loss: 0.6211 - accuracy: 0.5957 - val_loss: 0.7487 - val_accuracy: 0.5166
Epoch 4/5
20000/20000 [==============================] - 79s 4ms/sample - loss: 0.5420 - accuracy: 0.6313 - val_loss: 0.7912 - val_accuracy: 0.5070
Epoch 5/5
20000/20000 [==============================] - 79s 4ms/sample - loss: 0.4988 - accuracy: 0.6471 - val_loss: 0.8496 - val_accuracy: 0.5104
Wall time: 6min 39s
%%time
# Evaluate the trained SimpleRNN model on the test split.
rnn_model.evaluate(
    test_data,
    test_y,
    batch_size=64,
    verbose=2,
)
25000/1 - 23s - loss: 0.8093 - accuracy: 0.5141
Wall time: 23.1 s
[0.8431943800735474, 0.51408]
def lstm_model():
    """Build and compile an LSTM binary classifier.

    Layers: an Embedding (vocab_size ids -> 200-d vectors, sequences of
    length maxlen), a 64-unit LSTM that returns only its last output, and a
    single sigmoid unit.  Compiled with Adam, binary cross-entropy and the
    accuracy metric.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    layers = tf.keras.layers
    net = tf.keras.Sequential()
    net.add(layers.Embedding(input_dim=vocab_size, output_dim=200, input_length=maxlen))
    net.add(layers.LSTM(64, return_sequences=False))
    net.add(layers.Dense(1, activation='sigmoid'))
    net.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=['accuracy'],
    )
    return net
# Build the model and print its layer/parameter summary.
# NOTE(review): this rebinds the name lstm_model from the builder function to
# the model it returns, so the builder cannot be called again afterwards.
lstm_model = lstm_model()
lstm_model.summary()
Model: "sequential_15"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_5 (Embedding) (None, 256, 200) 17717200
_________________________________________________________________
lstm_28 (LSTM) (None, 64) 67840
_________________________________________________________________
dense_5 (Dense) (None, 1) 65
=================================================================
Total params: 17,785,105
Trainable params: 17,785,105
Non-trainable params: 0
_________________________________________________________________
%%time
# Train for 5 epochs with batches of 64, holding out 20% of the training
# data for validation.
history2 = lstm_model.fit(
    train_data,
    train_y,
    batch_size=64,
    epochs=5,
    validation_split=0.2,
)
Train on 20000 samples, validate on 5000 samples
Epoch 1/5
20000/20000 [==============================] - 118s 6ms/sample - loss: 0.6880 - accuracy: 0.5357 - val_loss: 0.6775 - val_accuracy: 0.5706
Epoch 2/5
20000/20000 [==============================] - 116s 6ms/sample - loss: 0.6560 - accuracy: 0.5788 - val_loss: 0.6639 - val_accuracy: 0.5706
Epoch 3/5
20000/20000 [==============================] - 114s 6ms/sample - loss: 0.6202 - accuracy: 0.6159 - val_loss: 0.6859 - val_accuracy: 0.5394
Epoch 4/5
20000/20000 [==============================] - 115s 6ms/sample - loss: 0.4847 - accuracy: 0.7706 - val_loss: 0.4180 - val_accuracy: 0.8340
Epoch 5/5
20000/20000 [==============================] - 115s 6ms/sample - loss: 0.2744 - accuracy: 0.8967 - val_loss: 0.3724 - val_accuracy: 0.8492
Wall time: 9min 39s
%%time
# Evaluate the trained LSTM model on the test split.
lstm_model.evaluate(
    test_data,
    test_y,
    batch_size=64,
    verbose=2,
)
25000/1 - 39s - loss: 0.3878 - accuracy: 0.8455
Wall time: 38.6 s
[0.3856367552185059, 0.84552]
def gru_model():
    """Build and compile a GRU binary classifier.

    Layers: an Embedding (vocab_size ids -> 200-d vectors, sequences of
    length maxlen), a 64-unit GRU that returns only its last output, and a
    single sigmoid unit.  Compiled with Adam, binary cross-entropy and the
    accuracy metric.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    layers = tf.keras.layers
    net = tf.keras.Sequential()
    net.add(layers.Embedding(input_dim=vocab_size, output_dim=200, input_length=maxlen))
    net.add(layers.GRU(64, return_sequences=False))
    net.add(layers.Dense(1, activation='sigmoid'))
    net.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=['accuracy'],
    )
    return net
# Build the model and print its layer/parameter summary.
# NOTE(review): this rebinds the name gru_model from the builder function to
# the model it returns, so the builder cannot be called again afterwards.
gru_model = gru_model()
gru_model.summary()
Model: "sequential_16"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_6 (Embedding) (None, 256, 200) 17717200
_________________________________________________________________
gru_1 (GRU) (None, 64) 51072
_________________________________________________________________
dense_6 (Dense) (None, 1) 65
=================================================================
Total params: 17,768,337
Trainable params: 17,768,337
Non-trainable params: 0
_________________________________________________________________
%%time
# Train for 5 epochs with batches of 64, holding out 20% of the training
# data for validation.
history3 = gru_model.fit(
    train_data,
    train_y,
    batch_size=64,
    epochs=5,
    validation_split=0.2,
)
Train on 20000 samples, validate on 5000 samples
Epoch 1/5
20000/20000 [==============================] - 109s 5ms/sample - loss: 0.6930 - accuracy: 0.5092 - val_loss: 0.6911 - val_accuracy: 0.5252
Epoch 2/5
20000/20000 [==============================] - 108s 5ms/sample - loss: 0.6596 - accuracy: 0.5753 - val_loss: 0.7330 - val_accuracy: 0.5642
Epoch 3/5
20000/20000 [==============================] - 108s 5ms/sample - loss: 0.5531 - accuracy: 0.7174 - val_loss: 0.5063 - val_accuracy: 0.7864
Epoch 4/5
20000/20000 [==============================] - 107s 5ms/sample - loss: 0.5976 - accuracy: 0.6156 - val_loss: 0.7025 - val_accuracy: 0.5920
Epoch 5/5
20000/20000 [==============================] - 107s 5ms/sample - loss: 0.3911 - accuracy: 0.8382 - val_loss: 0.5317 - val_accuracy: 0.7678
Wall time: 8min 58s
%%time
# Evaluate the trained GRU model on the test split.
gru_model.evaluate(
    test_data,
    test_y,
    batch_size=64,
    verbose=2,
)
25000/1 - 36s - loss: 0.4674 - accuracy: 0.7724
Wall time: 36.2 s
[0.529171141052246, 0.77236]
def bilstm_model():
    """Build and compile a bidirectional-LSTM binary classifier.

    Layers: an Embedding (vocab_size ids -> 200-d vectors, sequences of
    length maxlen), a 64-unit LSTM wrapped in Bidirectional that returns
    only its last output, and a single sigmoid unit.  Compiled with Adam,
    binary cross-entropy and the accuracy metric.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    layers = tf.keras.layers
    recurrent = layers.LSTM(64, return_sequences=False)
    net = tf.keras.Sequential()
    net.add(layers.Embedding(input_dim=vocab_size, output_dim=200, input_length=maxlen))
    net.add(layers.Bidirectional(recurrent))
    net.add(layers.Dense(1, activation='sigmoid'))
    net.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=['accuracy'],
    )
    return net
# Build the model and print its layer/parameter summary.
# NOTE(review): this rebinds the name bilstm_model from the builder function
# to the model it returns, so the builder cannot be called again afterwards.
bilstm_model = bilstm_model()
bilstm_model.summary()
Model: "sequential_17"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_7 (Embedding) (None, 256, 200) 17717200
_________________________________________________________________
bidirectional_22 (Bidirectio (None, 128) 135680
_________________________________________________________________
dense_7 (Dense) (None, 1) 129
=================================================================
Total params: 17,853,009
Trainable params: 17,853,009
Non-trainable params: 0
_________________________________________________________________
%%time
# Train for 5 epochs with batches of 64, holding out 20% of the training
# data for validation.
history4 = bilstm_model.fit(
    train_data,
    train_y,
    batch_size=64,
    epochs=5,
    validation_split=0.2,
)
Train on 20000 samples, validate on 5000 samples
Epoch 1/5
20000/20000 [==============================] - 241s 12ms/sample - loss: 0.4589 - accuracy: 0.7806 - val_loss: 0.3876 - val_accuracy: 0.8356
Epoch 2/5
20000/20000 [==============================] - 239s 12ms/sample - loss: 0.2686 - accuracy: 0.8963 - val_loss: 0.3592 - val_accuracy: 0.8670
Epoch 3/5
20000/20000 [==============================] - 242s 12ms/sample - loss: 0.1854 - accuracy: 0.9333 - val_loss: 0.3503 - val_accuracy: 0.8638
Epoch 4/5
20000/20000 [==============================] - 245s 12ms/sample - loss: 0.1387 - accuracy: 0.9500 - val_loss: 0.4455 - val_accuracy: 0.8556
Epoch 5/5
20000/20000 [==============================] - 244s 12ms/sample - loss: 0.0940 - accuracy: 0.9685 - val_loss: 0.5017 - val_accuracy: 0.8486
Wall time: 20min 10s
%%time
# Evaluate the trained bidirectional-LSTM model on the test split.
bilstm_model.evaluate(
    test_data,
    test_y,
    batch_size=64,
    verbose=2,
)
25000/1 - 50s - loss: 0.6497 - accuracy: 0.8325
Wall time: 50.3 s
[0.5397518084144592, 0.83252]