

import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import imdb


# Load the IMDB review splits, restricting the vocabulary to 10000 word ids.
(train_x, train_y), (test_x, test_y) = imdb.load_data(num_words=10000)
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
17465344/17464789 [==============================] - 21s 1us/step

参数num_words = 10000表示仅保留训练数据中出现频率最高的前10000个单词,低频单词将被舍弃(在序列中统一替换为未登录词标记oov_char,默认为2)。这样得到的向量数据不会太大,便于处理。



[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]



例如一句话:str_word = “stay strong Wuhan!stay strong China!”

1.去除空格,统计不重复词到列表:word_list = [“stay”,“strong”,“Wuhan”,"!",“China”],一共5个词

2.word_list列表中每个词用对应的下标索引表示:word_dic = {“stay”:0,“strong”:1,“Wuhan”:2,"!":3,“China”:4}

3.将原来句子中每个词用对应的下标索引(id)表示,句子转为整数序列:int_word = [0,1,2,3,0,1,4,3]



! = [10,12,19]

tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=word_dimension, input_length=maxlen)


Input shape:(batch_size, input_length)

Output shape:(batch_size, input_length, output_dim)


  • input_dim:词的总个数

  • output_dim:词嵌入的维度

  • input_length:当输入序列的长度固定时,该值为其长度。如果要在该层后接Flatten层,然后接Dense层,则必须指定该参数,否则Dense层的输出维度无法自动推断。

# Two sequences of three token ids each: shape (batch_size=2, input_length=3).
data = np.array([[0, 1, 2], [2, 1, 1]])
# input_length describes the fixed length of the input sequences, so it has to
# agree with the second dimension of ``data`` (3); ids range over 0..2, hence
# input_dim=3.
emb = tf.keras.layers.Embedding(input_dim=3, output_dim=3, input_length=3)



# Fetch the IMDB vocabulary as a dict mapping each word (str) to an integer index.
word_index = imdb.get_word_index()
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
1646592/1641221 [==============================] - 1s 0us/step
{'fawn': 34701,
 'tsukino': 52006,
 'nunnery': 52007,
 'sonja': 16816,
 'vani': 63951,
 'woods': 1408,
 'spiders': 16115,
 'hanging': 2345,
 'woody': 2289,
 'trawling': 52008,
 "hold's": 52009,
 'comically': 11307,
 'localized': 40830,
 'disobeying': 30568,
 "'royale": 52010,
 "harpo's": 40831,
 'canet': 52011,
 'aileen': 19313,
 'acurately': 52012,
 "diplomat's": 52013,
 'rickman': 25242,
 'arranged': 6746,
 'rumbustious': 52014,
 'familiarness': 52015,
 "spider'": 52016,
 'hahahah': 68804,
 "wood'": 52017,
 'transvestism': 40833,
 "hangin'": 34702,
 'bringing': 2338,
 'seamier': 40834,
 'wooded': 34703,
 'bravora': 52018,
 'grueling': 16817,
 'wooden': 1636,
 'wednesday': 16818,
 "'prix": 52019,
 'altagracia': 34704,
 'circuitry': 52020,
 'crotch': 11585,
 'busybody': 57766,
 "tart'n'tangy": 52021,
 'burgade': 14129,
 'thrace': 52023,
 "tom's": 11038,
 'snuggles': 52025,
 'francesco': 29114,
 'complainers': 52027,
 'templarios': 52125,
 '272': 40835,
 '273': 52028,
 'zaniacs': 52130,
 '275': 34706,
 'consenting': 27631,
 'snuggled': 40836,
 'inanimate': 15492,
 'uality': 52030,
 'bronte': 11926,
 'errors': 4010,
 'dialogs': 3230,
 "yomada's": 52031,
 "madman's": 34707,
 'dialoge': 30585,
 'usenet': 52033,
 'videodrome': 40837,
 "kid'": 26338,
 'pawed': 52034,
 "'girlfriend'": 30569,
 "'pleasure": 52035,
 "'reloaded'": 52036,
 "kazakos'": 40839,
 'rocque': 52037,
 'mailings': 52038,
 'brainwashed': 11927,
 'mcanally': 16819,
 "tom''": 52039,
 'kurupt': 25243,
 'affiliated': 21905,
 'babaganoosh': 52040,
 "noe's": 40840,
 'quart': 40841,
 'kids': 359,
 'uplifting': 5034,
 'controversy': 7093,
 'kida': 21906,
 'kidd': 23379,
 "error'": 52041,
 'neurologist': 52042,
 'spotty': 18510,
 'cobblers': 30570,
 'projection': 9878,
 'fastforwarding': 40842,
 'sters': 52043,
 "eggar's": 52044,
 'etherything': 52045,
 'gateshead': 40843,
 'airball': 34708,
 'unsinkable': 25244,
 'stern': 7180,
 "cervi's": 52046,
 'dnd': 40844,
 'dna': 11586,
 'insecurity': 20598,
 "'reboot'": 52047,
 'trelkovsky': 11037,
 'jaekel': 52048,
 'sidebars': 52049,
 "sforza's": 52050,
 'distortions': 17633,
 'mutinies': 52051,
 'sermons': 30602,
 '7ft': 40846,
 'boobage': 52052,
 "o'bannon's": 52053,
 'populations': 23380,
 'chulak': 52054,
 'mesmerize': 27633,
 'quinnell': 52055,
 'yahoo': 10307,
 'meteorologist': 52057,
 'beswick': 42577,
 'boorman': 15493,
 'voicework': 40847,
 "ster'": 52058,
 'blustering': 22922,
 'hj': 52059,
 'intake': 27634,
 'morally': 5621,
 'jumbling': 40849,
 'bowersock': 52060,
 "'porky's'": 52061,
 'gershon': 16821,
 'ludicrosity': 40850,
 'coprophilia': 52062,
 'expressively': 40851,
 "india's": 19500,
 "post's": 34710,
 'wana': 52063,
 'wang': 5283,
 'wand': 30571,
 'wane': 25245,
 'edgeways': 52321,
 'titanium': 34711,
 'pinta': 40852,
 'want': 178,
 'pinto': 30572,
 'whoopdedoodles': 52065,
 'tchaikovsky': 21908,
 'travel': 2103,
 "'victory'": 52066,
 'copious': 11928,
 'gouge': 22433,
 "chapters'": 52067,
 'barbra': 6702,
 'uselessness': 30573,
 "wan'": 52068,
 'assimilated': 27635,
 'petiot': 16116,
 'most\x85and': 52069,
 'dinosaurs': 3930,
 'wrong': 352,
 'seda': 52070,
 'stollen': 52071,
 'sentencing': 34712,
 'ouroboros': 40853,
 'assimilates': 40854,
 'colorfully': 40855,
 'glenne': 27636,
 'dongen': 52072,
 'subplots': 4760,
 'kiloton': 52073,
 'chandon': 23381,
 "effect'": 34713,
 'snugly': 27637,
 'kuei': 40856,
 'welcomed': 9092,
 'dishonor': 30071,
 'concurrence': 52075,
 'stoicism': 23382,
 "guys'": 14896,
 "beroemd'": 52077,
 'butcher': 6703,
 "melfi's": 40857,
 'aargh': 30623,
 'playhouse': 20599,
 'wickedly': 11308,
 'fit': 1180,
 'labratory': 52078,
 'lifeline': 40859,
 'screaming': 1927,
 'fix': 4287,
 'cineliterate': 52079,
 'fic': 52080,
 'fia': 52081,
 'fig': 34714,
 'fmvs': 52082,
 'fie': 52083,
 'reentered': 52084,
 'fin': 30574,
 'doctresses': 52085,
 'fil': 52086,
 'zucker': 12606,
 'ached': 31931,
 'counsil': 52088,
 'paterfamilias': 52089,
 'songwriter': 13885,
 'shivam': 34715,
 'hurting': 9654,
 'effects': 299,
 'slauther': 52090,
 "'flame'": 52091,
 'sommerset': 52092,
 'interwhined': 52093,
 'whacking': 27638,
 'bartok': 52094,
 'barton': 8775,
 'frewer': 21909,
 "fi'": 52095,
 'ingrid': 6192,
 'stribor': 30575,
 'approporiately': 52096,
 'wobblyhand': 52097,
 'tantalisingly': 52098,
 'ankylosaurus': 52099,
 'parasites': 17634,
 'childen': 52100,
 "jenkins'": 52101,
 'metafiction': 52102,
 'golem': 17635,
 'indiscretion': 40860,
 "reeves'": 23383,
 "inamorata's": 57781,
 'brittannica': 52104,
 'adapt': 7916,
 "russo's": 30576,
 'guitarists': 48246,
 'abbott': 10553,
 'abbots': 40861,
 'lanisha': 17649,
 'magickal': 40863,
 'mattter': 52105,
 "'willy": 52106,
 'pumpkins': 34716,
 'stuntpeople': 52107,
 'estimate': 30577,
 'ugghhh': 40864,
 'gameplay': 11309,
 "wern't": 52108,
 "n'sync": 40865,
 'sickeningly': 16117,
 'chiara': 40866,
 'disturbed': 4011,
 'portmanteau': 40867,
 'ineffectively': 52109,
 "duchonvey's": 82143,
 "nasty'": 37519,
 'purpose': 1285,
 'lazers': 52112,
 'lightened': 28105,
 'kaliganj': 52113,
 'popularism': 52114,
 "damme's": 18511,
 'stylistics': 30578,
 'mindgaming': 52115,
 'spoilerish': 46449,
 "'corny'": 52117,
 'boerner': 34718,
 'olds': 6792,
 'bakelite': 52118,
 'renovated': 27639,
 'forrester': 27640,
 "lumiere's": 52119,
 'gaskets': 52024,
 'needed': 884,
 'smight': 34719,
 'master': 1297,
 "edie's": 25905,
 'seeber': 40868,
 'hiya': 52120,
 'fuzziness': 52121,
 'genesis': 14897,
 'rewards': 12607,
 'enthrall': 30579,
 "'about": 40869,
 "recollection's": 52122,
 'mutilated': 11039,
 'fatherlands': 52123,
 "fischer's": 52124,
 'positively': 5399,
 '270': 34705,
 'ahmed': 34720,
 'zatoichi': 9836,
 'bannister': 13886,
 'anniversaries': 52127,
 "helm's": 30580,
 "'work'": 52128,
 'exclaimed': 34721,
 "'unfunny'": 52129,
 '274': 52029,
 'feeling': 544,
 "wanda's": 52131,
 'dolan': 33266,
 '278': 52133,
 'peacoat': 52134,
 'brawny': 40870,
 'mishra': 40871,
 'worlders': 40872,
 'protags': 52135,
 'skullcap': 52136,
 'dastagir': 57596,
 'affairs': 5622,
 'wholesome': 7799,
 'hymen': 52137,
 'paramedics': 25246,
 'unpersons': 52138,
 'heavyarms': 52139,
 'affaire': 52140,
 'coulisses': 52141,
 'hymer': 40873,
 'kremlin': 52142,
 'shipments': 30581,
 'pixilated': 52143,
 "'00s": 30582,
 'diminishing': 18512,
 'cinematic': 1357,
 'resonates': 14898,
 'simplify': 40874,
 "nature'": 40875,
 'temptresses': 40876,
 'reverence': 16822,
 'resonated': 19502,
 'dailey': 34722,
 '2\x85': 52144,
 'treize': 27641,
 'majo': 52145,
 'kiya': 21910,
 'woolnough': 52146,
 'thanatos': 39797,
 'sandoval': 35731,
 'dorama': 40879,
 "o'shaughnessy": 52147,
 'tech': 4988,
 'fugitives': 32018,
 'teck': 30583,
 "'e'": 76125,
 'doesn’t': 40881,
 'purged': 52149,
 'saying': 657,
 "martians'": 41095,
 'norliss': 23418,
 'dickey': 27642,
 'dicker': 52152,
 "'sependipity": 52153,
 'padded': 8422,
 'ordell': 57792,
 "sturges'": 40882,
 'independentcritics': 52154,
 'tempted': 5745,
 "atkinson's": 34724,
 'hounded': 25247,
 'apace': 52155,
 'clicked': 15494,
 "'humor'": 30584,
 "martino's": 17177,
 "'supporting": 52156,
 'warmongering': 52032,
 "zemeckis's": 34725,
 'lube': 21911,
 'shocky': 52157,
 'plate': 7476,
 'plata': 40883,
 'sturgess': 40884,
 "nerds'": 40885,
 'plato': 20600,
 'plath': 34726,
 'platt': 40886,
 'mcnab': 52159,
 'clumsiness': 27643,
 'altogether': 3899,
 'massacring': 42584,
 'bicenntinial': 52160,
 'skaal': 40887,
 'droning': 14360,
 'lds': 8776,
 'jaguar': 21912,
 "cale's": 34727,
 'nicely': 1777,
 'mummy': 4588,
 "lot's": 18513,
 'patch': 10086,
 'kerkhof': 50202,
 "leader's": 52161,
 "'movie": 27644,
 'uncomfirmed': 52162,
 'heirloom': 40888,
 'wrangle': 47360,
 'emotion\x85': 52163,
 "'stargate'": 52164,
 'pinoy': 40889,
 'conchatta': 40890,
 'broeke': 41128,
 'advisedly': 40891,
 "barker's": 17636,
 'descours': 52166,
 'lots': 772,
 'lotr': 9259,
 'irs': 9879,
 'lott': 52167,
 'xvi': 40892,
 'irk': 34728,
 'irl': 52168,
 'ira': 6887,
 'belzer': 21913,
 'irc': 52169,
 'ire': 27645,
 'requisites': 40893,
 'discipline': 7693,
 'lyoko': 52961,
 'extend': 11310,
 'nature': 873,
 "'dickie'": 52170,
 'optimist': 40894,
 'lapping': 30586,
 'superficial': 3900,
 'vestment': 52171,
 'extent': 2823,
 'tendons': 52172,
 "heller's": 52173,
 'quagmires': 52174,
 'miyako': 52175,
 'moocow': 20601,
 "coles'": 52176,
 'lookit': 40895,
 'ravenously': 52177,
 'levitating': 40896,
 'perfunctorily': 52178,
 'lookin': 30587,
 "lot'": 40898,
 'lookie': 52179,
 'fearlessly': 34870,
 'libyan': 52181,
 'fondles': 40899,
 'gopher': 35714,
 'wearying': 40901,
 "nz's": 52182,
 'minuses': 27646,
 'puposelessly': 52183,
 'shandling': 52184,
 'decapitates': 31268,
 'humming': 11929,
 "'nother": 40902,
 'smackdown': 21914,
 'underdone': 30588,
 'frf': 40903,
 'triviality': 52185,
 'fro': 25248,
 'bothers': 8777,
 "'kensington": 52186,
 'much': 73,
 'muco': 34730,
 'wiseguy': 22615,
 "richie's": 27648,
 'tonino': 40904,
 'unleavened': 52187,
 'fry': 11587,
 "'tv'": 40905,
 'toning': 40906,
 'obese': 14361,
 'sensationalized': 30589,
 'spiv': 40907,
 'spit': 6259,
 'arkin': 7364,
 'charleton': 21915,
 'jeon': 16823,
 'boardroom': 21916,
 'doubts': 4989,
 'spin': 3084,
 'hepo': 53083,
 'wildcat': 27649,
 'venoms': 10584,
 'misconstrues': 52191,
 'mesmerising': 18514,
 'misconstrued': 40908,
 'rescinds': 52192,
 'prostrate': 52193,
 'majid': 40909,
 'climbed': 16479,
 'canoeing': 34731,
 'majin': 52195,
 'animie': 57804,
 'sylke': 40910,
 'conditioned': 14899,
 'waddell': 40911,
 '3\x85': 52196,
 'hyperdrive': 41188,
 'conditioner': 34732,
 'bricklayer': 53153,
 'hong': 2576,
 'memoriam': 52198,
 'inventively': 30592,
 "levant's": 25249,
 'portobello': 20638,
 'remand': 52200,
 'mummified': 19504,
 'honk': 27650,
 'spews': 19505,
 'visitations': 40912,
 'mummifies': 52201,
 'cavanaugh': 25250,
 'zeon': 23385,
 "jungle's": 40913,
 'viertel': 34733,
 'frenchmen': 27651,
 'torpedoes': 52202,
 'schlessinger': 52203,
 'torpedoed': 34734,
 'blister': 69876,
 'cinefest': 52204,
 'furlough': 34735,
 'mainsequence': 52205,
 'mentors': 40914,
 'academic': 9094,
 'stillness': 20602,
 'academia': 40915,
 'lonelier': 52206,
 'nibby': 52207,
 "losers'": 52208,
 'cineastes': 40916,
 'corporate': 4449,
 'massaging': 40917,
 'bellow': 30593,
 'absurdities': 19506,
 'expetations': 53241,
 'nyfiken': 40918,
 'mehras': 75638,
 'lasse': 52209,
 'visability': 52210,
 'militarily': 33946,
 "elder'": 52211,
 'gainsbourg': 19023,
 'hah': 20603,
 'hai': 13420,
 'haj': 34736,
 'hak': 25251,
 'hal': 4311,
 'ham': 4892,
 'duffer': 53259,
 'haa': 52213,
 'had': 66,
 'advancement': 11930,
 'hag': 16825,
 "hand'": 25252,
 'hay': 13421,
 'mcnamara': 20604,
 "mozart's": 52214,
 'duffel': 30731,
 'haq': 30594,
 'har': 13887,
 'has': 44,
 'hat': 2401,
 'hav': 40919,
 'haw': 30595,
 'figtings': 52215,
 'elders': 15495,
 'underpanted': 52216,
 'pninson': 52217,
 'unequivocally': 27652,
 "barbara's": 23673,
 "bello'": 52219,
 'indicative': 12997,
 'yawnfest': 40920,
 'hexploitation': 52220,
 "loder's": 52221,
 'sleuthing': 27653,
 "justin's": 32622,
 "'ball": 52222,
 "'summer": 52223,
 "'demons'": 34935,
 "mormon's": 52225,
 "laughton's": 34737,
 'debell': 52226,
 'shipyard': 39724,
 'unabashedly': 30597,
 'disks': 40401,
 'crowd': 2290,
 'crowe': 10087,
 "vancouver's": 56434,
 'mosques': 34738,
 'crown': 6627,
 'culpas': 52227,
 'crows': 27654,
 'surrell': 53344,
 'flowless': 52229,
 'sheirk': 52230,
 "'three": 40923,
 "peterson'": 52231,
 'ooverall': 52232,
 'perchance': 40924,
 'bottom': 1321,
 'chabert': 53363,
 'sneha': 52233,
 'inhuman': 13888,
 'ichii': 52234,
 'ursla': 52235,
 'completly': 30598,
 'moviedom': 40925,
 'raddick': 52236,
 'brundage': 51995,
 'brigades': 40926,
 'starring': 1181,
 "'goal'": 52237,
 'caskets': 52238,
 'willcock': 52239,
 "threesome's": 52240,
 "mosque'": 52241,
 "cover's": 52242,
 'spaceships': 17637,
 'anomalous': 40927,
 'ptsd': 27655,
 'shirdan': 52243,
 'obscenity': 21962,
 'lemmings': 30599,
 'duccio': 30600,
 "levene's": 52244,
 "'gorby'": 52245,
 "teenager's": 25255,
 'marshall': 5340,
 'honeymoon': 9095,
 'shoots': 3231,
 'despised': 12258,
 'okabasho': 52246,
 'fabric': 8289,
 'cannavale': 18515,
 'raped': 3537,
 "tutt's": 52247,
 'grasping': 17638,
 'despises': 18516,
 "thief's": 40928,
 'rapes': 8926,
 'raper': 52248,
 "eyre'": 27656,
 'walchek': 52249,
 "elmo's": 23386,
 'perfumes': 40929,
 'spurting': 21918,
 "exposition'\x85": 52250,
 'denoting': 52251,
 'thesaurus': 34740,
 "shoot'": 40930,
 'bonejack': 49759,
 'simpsonian': 52253,
 'hebetude': 30601,
 "hallow's": 34741,
 'desperation\x85': 52254,
 'incinerator': 34742,
 'congratulations': 10308,
 'humbled': 52255,
 "else's": 5924,
 'trelkovski': 40845,
 "rape'": 52256,
 "'chapters'": 59386,
 '1600s': 52257,
 'martian': 7253,
 'nicest': 25256,
 'eyred': 52259,
 'passenger': 9457,
 'disgrace': 6041,
 'moderne': 52260,
 'barrymore': 5120,
 'yankovich': 52261,
 'moderns': 40931,
 'studliest': 52262,
 'bedsheet': 52263,
 'decapitation': 14900,
 'slurring': 52264,
 "'nunsploitation'": 52265,
 "'character'": 34743,
 'cambodia': 9880,
 'rebelious': 52266,
 'pasadena': 27657,
 'crowne': 40932,
 "'bedchamber": 52267,
 'conjectural': 52268,
 'appologize': 52269,
 'halfassing': 52270,
 'paycheque': 57816,
 'palms': 20606,
 "'islands": 52271,
 'hawked': 40933,
 'palme': 21919,
 'conservatively': 40934,
 'larp': 64007,
 'palma': 5558,
 'smelling': 21920,
 'aragorn': 12998,
 'hawker': 52272,
 'hawkes': 52273,
 'explosions': 3975,
 'loren': 8059,
 "pyle's": 52274,
 'shootout': 6704,
 "mike's": 18517,
 "driscoll's": 52275,
 'cogsworth': 40935,
 "britian's": 52276,
 'childs': 34744,
 "portrait's": 52277,
 'chain': 3626,
 'whoever': 2497,
 'puttered': 52278,
 'childe': 52279,
 'maywether': 52280,
 'chair': 3036,
 "rance's": 52281,
 'machu': 34745,
 'ballet': 4517,
 'grapples': 34746,
 'summerize': 76152,
 'freelance': 30603,
 "andrea's": 52283,
 '\x91very': 52284,
 'coolidge': 45879,
 'mache': 18518,
 'balled': 52285,
 'grappled': 40937,
 'macha': 18519,
 'underlining': 21921,
 'macho': 5623,
 'oversight': 19507,
 'machi': 25257,
 'verbally': 11311,
 'tenacious': 21922,
 'windshields': 40938,
 'paychecks': 18557,
 'jerk': 3396,
 "good'": 11931,
 'prancer': 34748,
 'prances': 21923,
 'olympus': 52286,
 'lark': 21924,
 'embark': 10785,
 'gloomy': 7365,
 'jehaan': 52287,
 'turaqui': 52288,
 "child'": 20607,
 'locked': 2894,
 'pranced': 52289,
 'exact': 2588,
 'unattuned': 52290,
 'minute': 783,
 'skewed': 16118,
 'hodgins': 40940,
 'skewer': 34749,
 'think\x85': 52291,
 'rosenstein': 38765,
 'helmit': 52292,
 'wrestlemanias': 34750,
 'hindered': 16826,
 "martha's": 30604,
 'cheree': 52293,
 "pluckin'": 52294,
 'ogles': 40941,
 'heavyweight': 11932,
 'aada': 82190,
 'chopping': 11312,
 'strongboy': 61534,
 'hegemonic': 41342,
 'adorns': 40942,
 'xxth': 41346,
 'nobuhiro': 34751,
 'capitães': 52298,
 'kavogianni': 52299,
 'antwerp': 13422,
 'celebrated': 6538,
 'roarke': 52300,
 'baggins': 40943,
 'cheeseburgers': 31270,
 'matras': 52301,
 "nineties'": 52302,
 "'craig'": 52303,
 'celebrates': 12999,
 'unintentionally': 3383,
 'drafted': 14362,
 'climby': 52304,
 '303': 52305,
 'oldies': 18520,
 'climbs': 9096,
 'honour': 9655,
 'plucking': 34752,
 '305': 30074,
 'address': 5514,
 'menjou': 40944,
 "'freak'": 42592,
 'dwindling': 19508,
 'benson': 9458,
 'white’s': 52307,
 'shamelessness': 40945,
 'impacted': 21925,
 'upatz': 52308,
 'cusack': 3840,
 "flavia's": 37567,
 'effette': 52309,
 'influx': 34753,
 'boooooooo': 52310,
 'dimitrova': 52311,
 'houseman': 13423,
 'bigas': 25259,
 'boylen': 52312,
 'phillipenes': 52313,
 'fakery': 40946,
 "grandpa's": 27658,
 'darnell': 27659,
 'undergone': 19509,
 'handbags': 52315,
 'perished': 21926,
 'pooped': 37778,
 'vigour': 27660,
 'opposed': 3627,
 'etude': 52316,
 "caine's": 11799,
 'doozers': 52317,
 'photojournals': 34754,
 'perishes': 52318,
 'constrains': 34755,
 'migenes': 40948,
 'consoled': 30605,
 'alastair': 16827,
 'wvs': 52319,
 'ooooooh': 52320,
 'approving': 34756,
 'consoles': 40949,
 'disparagement': 52064,
 'futureistic': 52322,
 'rebounding': 52323,
 "'date": 52324,
 'gregoire': 52325,
 'rutherford': 21927,
 'americanised': 34757,
 'novikov': 82196,
 'following': 1042,
 'munroe': 34758,
 "morita'": 52326,
 'christenssen': 52327,
 'oatmeal': 23106,
 'fossey': 25260,
 'livered': 40950,
 'listens': 13000,
 "'marci": 76164,
 "otis's": 52330,
 'thanking': 23387,
 'maude': 16019,
 'extensions': 34759,
 'ameteurish': 52332,
 "commender's": 52333,
 'agricultural': 27661,
 'convincingly': 4518,
 'fueled': 17639,
 'mahattan': 54014,
 "paris's": 40952,
 'vulkan': 52336,
 'stapes': 52337,
 'odysessy': 52338,
 'harmon': 12259,
 'surfing': 4252,
 'halloran': 23494,
 'unbelieveably': 49580,
 "'offed'": 52339,
 'quadrant': 30607,
 'inhabiting': 19510,
 'nebbish': 34760,
 'forebears': 40953,
 'skirmish': 34761,
 'ocassionally': 52340,
 "'resist": 52341,
 'impactful': 21928,
 'spicier': 52342,
 'touristy': 40954,
 "'football'": 52343,
 'webpage': 40955,
 'exurbia': 52345,
 'jucier': 52346,
 'professors': 14901,
 'structuring': 34762,
 'jig': 30608,
 'overlord': 40956,
 'disconnect': 25261,
 'sniffle': 82201,
 'slimeball': 40957,
 'jia': 40958,
 'milked': 16828,
 'banjoes': 40959,
 'jim': 1237,
 'workforces': 52348,
 'jip': 52349,
 'rotweiller': 52350,
 'mundaneness': 34763,
 "'ninja'": 52351,
 "dead'": 11040,
 "cipriani's": 40960,
 'modestly': 20608,
 "professor'": 52352,
 'shacked': 40961,
 'bashful': 34764,
 'sorter': 23388,
 'overpowering': 16120,
 'workmanlike': 18521,
 'henpecked': 27662,
 'sorted': 18522,
 "jōb's": 52354,
 "'always": 52355,
 "'baptists": 34765,
 'dreamcatchers': 52356,
 "'silence'": 52357,
 'hickory': 21929,
 'fun\x97yet': 52358,
 'breakumentary': 52359,
 'didn': 15496,
 'didi': 52360,
 'pealing': 52361,
 'dispite': 40962,
 "italy's": 25262,
 'instability': 21930,
 'quarter': 6539,
 'quartet': 12608,
 'padmé': 52362,
 "'bleedmedry": 52363,
 'pahalniuk': 52364,
 'honduras': 52365,
 'bursting': 10786,
 "pablo's": 41465,
 'irremediably': 52367,
 'presages': 40963,
 'bowlegged': 57832,
 'dalip': 65183,
 'entering': 6260,
 'newsradio': 76172,
 'presaged': 54150,
 "giallo's": 27663,
 'bouyant': 40964,
 'amerterish': 52368,
 'rajni': 18523,
 'leeves': 30610,
 'macauley': 34767,
 'seriously': 612,
 'sugercoma': 52369,
 'grimstead': 52370,
 "'fairy'": 52371,
 'zenda': 30611,
 "'twins'": 52372,
 'realisation': 17640,
 'highsmith': 27664,
 'raunchy': 7817,
 'incentives': 40965,
 'flatson': 52374,
 'snooker': 35097,
 'crazies': 16829,
 'crazier': 14902,
 'grandma': 7094,
 'napunsaktha': 52375,
 'workmanship': 30612,
 'reisner': 52376,
 "sanford's": 61306,
 '\x91doña': 52377,
 'modest': 6108,
 "everything's": 19153,
 'hamer': 40966,
 "couldn't'": 52379,
 'quibble': 13001,
 'socking': 52380,
 'tingler': 21931,
 'gutman': 52381,
 'lachlan': 40967,
 'tableaus': 52382,
 'headbanger': 52383,
 'spoken': 2847,
 'cerebrally': 34768,
 "'road": 23490,
 'tableaux': 21932,
 "proust's": 40968,
 'periodical': 40969,
 "shoveller's": 52385,
 'tamara': 25263,
 'affords': 17641,
 'concert': 3249,
 "yara's": 87955,
 'someome': 52386,
 'lingering': 8424,
 "abraham's": 41511,
 'beesley': 34769,
 'cherbourg': 34770,
 'kagan': 28624,
 'snatch': 9097,
 "miyazaki's": 9260,
 'absorbs': 25264,
 "koltai's": 40970,
 'tingled': 64027,
 'crossroads': 19511,
 'rehab': 16121,
 'falworth': 52389,
 'sequals': 52390,


  • PAD:短句子补长的标识符
  • UNK:词表中未出现过的词
# Build the word -> id table: every index taken from word_index is moved up by one.
word2id = dict((token, rank + 1) for token, rank in word_index.items())
# NOTE(review): both assignments below write to the same empty-string key, so
# the second one overwrites the first and '' ends up mapped to 1. The
# surrounding text describes two distinct markers (PAD and UNK); presumably the
# key names were lost -- confirm the intended keys before relying on this table.
word2id[''] = 0
word2id[''] = 1


# Inverse lookup table: shifted id (word_index value + 1) -> word.
id2word = {v: k for k, v in word2id.items()}


def get_words(sent_ids):
    """Decode a sequence of raw IMDB word ids into a space-separated string.

    ``imdb.load_data`` offsets every word index by ``index_from`` (3 by
    default) to reserve the low ids for padding / start / out-of-vocabulary
    markers, while ``id2word`` is keyed by ``word_index`` value + 1.  A raw id
    ``i`` therefore corresponds to the key ``i - 3 + 1 == i - 2``.

    Args:
        sent_ids: Iterable of integer word ids as returned by
            ``imdb.load_data`` (e.g. ``train_x[0]``).

    Returns:
        The decoded words joined by single spaces.  Ids that have no entry in
        ``id2word`` (such as the start-of-sequence marker) decode to an empty
        string.
    """
    return ' '.join(id2word.get(i - 2, "") for i in sent_ids)


# NOTE: the sample text printed below this statement was produced with the
# earlier ``i + 1`` lookup and does not reflect the corrected mapping.
sent = get_words(train_x[0])
the as you with out themselves powerful lets loves their becomes reaching had journalist of lot from anyone to have after out atmosphere never more room and it so heart shows to years of every never going and help moments or of every chest visual movie except her was several of enough more with is now current film as you of mine potentially unfortunately of you than him that with out themselves her get for was camp of you movie sometimes movie that with scary but and to story wonderful that in seeing in character to of 70s musicians with heart had shadows they of here that with her serious to have does when from why what have critics they is you that isn't one will very to as itself with other and in of seen over landed for anyone of and br show's to whether from than out themselves history he name half some br of and odd was two most of mean for 1 any an boat she he should is thought frog but of script you not while history he heart to real at barrel but when from one bit then have two of script their with her nobody most that with wasn't to with armed acting watch an for with heartfelt film want an


# Shift every word id up by one, mirroring the +1 shift applied when word2id
# was built.  Reviews have different lengths, so the outer array must be an
# object array of lists: without dtype=object recent NumPy versions raise a
# ValueError for ragged nested sequences.
train_text = np.array([[i + 1 for i in text] for text in train_x], dtype=object)
test_text = np.array([[i + 1 for i in text] for text in test_x], dtype=object)


tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre', truncating='pre', value=0.)


  • sequences:浮点数或整数构成的两层嵌套列表

  • maxlen:None或整数,为序列的最大长度。大于此长度的序列将被截短,小于此长度的序列将被补0(补在起始还是结尾由padding参数决定,默认在起始处)。

  • dtype:返回的numpy array的数据类型

  • padding:‘pre’或‘post’,确定当需要补0时,在序列的起始(pre)还是结尾(post)补,默认pre

  • truncating:‘pre’或‘post’,确定当需要截断序列时,从起始(pre)还是结尾(post)截断,默认pre

  • value:浮点数,此值将在填充时代替默认的填充值0

# Example: pad or truncate every sequence in ``a`` to exactly 4 ids, doing both
# at the end of the sequence and filling with 0.
# NOTE(review): ``a`` is not defined in the visible text -- presumably a nested
# list of integer ids; confirm.
bs_packed = tf.keras.preprocessing.sequence.pad_sequences(a,maxlen=4,padding='post',truncating='post',value = 0)
[[1 2 3 0]
 [4 5 6 7]]
# Pad (and truncate) at the end of each review so that every sample holds
# exactly 256 ids.  The fill value is 0, matching the zero-padded array printed
# below; the shifted word ids start at 2, so 0 never collides with a real token.
train_data = tf.keras.preprocessing.sequence.pad_sequences(
    train_text, value=0,
    padding='post', truncating='post', maxlen=256
)
test_data = tf.keras.preprocessing.sequence.pad_sequences(
    test_text, value=0,
    padding='post', truncating='post', maxlen=256
)
array([   2,   15,   23,   17,   44,  531,  974, 1623, 1386,   66,  459,
       4469,   67, 3942,    5,  174,   37,  257,    6,   26,  101,   44,
        839,  113,   51,  671,    3,   10,   36,  481,  285,    6,  151,
          5,  173,  113,  168,    3,  337,  386,   40,    5,  173, 4537,
       1112,   18,  547,   39,   14,  448,    5,  193,   51,   17,    7,
        148, 2026,   20,   15,   23,    5, 1921, 4614,  470,    5,   23,
         72,   88,   13,   17,   44,  531,   39,   77,   16,   14, 1248,
          5,   23,   18,  516,   18,   13,   17,  627,   19,    3,    6,
         63,  387,   13,    9,  317,    9,  107,    6,    5, 2224, 5245,
         17,  481,   67, 3786,   34,    5,  131,   13,   17,   39,  620,
          6,   26,  125,   52,   37,  136,   49,   26, 1416,   34,    7,
         23,   13,  216,   29,   78,   53,    6,   15,  408,   17,   83,
          3,    9,    5,  108,  118, 5953,   16,  257,    5,    3,    8,
       3767,    6,  724,   37,   72,   44,  531,  477,   27,  401,  318,
         47,    8,    5,    3, 1030,   14,  105,   89,    5,  382,   16,
        298,   99,   33, 2072,   57,   27,  142,    7,  195, 7487,   19,
          5,  227,   23,   22,  135,  477,   27,  481,    6,  145,   31,
       5536,   19,   52,   37,   29,  225,   93,   26,  105,    5,  227,
         66,   17,   39, 1335,   89,   13,   17,  284,    6,   17, 4473,
        114,  104,   33,   16,   17, 5346,   20,  179,   33,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0])


# Vocabulary size: the number of entries in word2id (used as the Embedding input_dim).
vocab_size = len(word2id)
# Maximum sequence length (used as the Embedding input_length).
maxlen = 256


def rnn_model():
    """Build and compile a SimpleRNN binary classifier.

    Architecture: an Embedding layer (``vocab_size`` ids -> 200-dimensional
    vectors, sequences of length ``maxlen``), a 64-unit SimpleRNN that returns
    only its final output (``return_sequences=False``), and a single sigmoid
    unit.

    Returns:
        A compiled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=200, input_length=maxlen),
        tf.keras.layers.SimpleRNN(64, return_sequences=False),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])
    # fit() requires a compiled model.  The loss and metric match the
    # ``loss`` / ``accuracy`` columns of the training log; the optimizer is an
    # assumption -- TODO confirm against the original notebook.
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


# NOTE: this rebinds ``rnn_model`` from the builder function to the model
# instance, so the builder cannot be called again afterwards.
rnn_model = rnn_model()
# Train for 5 epochs, holding out the last 20% of the training data for validation.
history1 = rnn_model.fit(train_data, train_y, batch_size=64, epochs=5, validation_split=0.2)
Train on 20000 samples, validate on 5000 samples
Epoch 1/5
20000/20000 [==============================] - 81s 4ms/sample - loss: 0.6956 - accuracy: 0.4990 - val_loss: 0.6935 - val_accuracy: 0.4992
Epoch 2/5
20000/20000 [==============================] - 81s 4ms/sample - loss: 0.6582 - accuracy: 0.5728 - val_loss: 0.7296 - val_accuracy: 0.5044
Epoch 3/5
20000/20000 [==============================] - 80s 4ms/sample - loss: 0.6211 - accuracy: 0.5957 - val_loss: 0.7487 - val_accuracy: 0.5166
Epoch 4/5
20000/20000 [==============================] - 79s 4ms/sample - loss: 0.5420 - accuracy: 0.6313 - val_loss: 0.7912 - val_accuracy: 0.5070
Epoch 5/5
20000/20000 [==============================] - 79s 4ms/sample - loss: 0.4988 - accuracy: 0.6471 - val_loss: 0.8496 - val_accuracy: 0.5104
Wall time: 6min 39s
# Evaluate the trained SimpleRNN model on the test split in batches of 64.
rnn_model.evaluate(test_data, test_y, batch_size=64,verbose=2)
25000/1 - 23s - loss: 0.8093 - accuracy: 0.5141
Wall time: 23.1 s

[0.8431943800735474, 0.51408]


def lstm_model():
    """Build and compile an LSTM binary classifier.

    Architecture: an Embedding layer (``vocab_size`` ids -> 200-dimensional
    vectors, sequences of length ``maxlen``), a 64-unit LSTM that returns only
    its final output (``return_sequences=False``), and a single sigmoid unit.

    Returns:
        A compiled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=200, input_length=maxlen),
        tf.keras.layers.LSTM(64, return_sequences=False),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])
    # fit() requires a compiled model.  The loss and metric match the
    # ``loss`` / ``accuracy`` columns of the training log; the optimizer is an
    # assumption -- TODO confirm against the original notebook.
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


# NOTE: this rebinds ``lstm_model`` from the builder function to the model
# instance, so the builder cannot be called again afterwards.
lstm_model = lstm_model()
Model: "sequential_15"
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 256, 200)          17717200  
lstm_28 (LSTM)               (None, 64)                67840     
dense_5 (Dense)              (None, 1)                 65        
Total params: 17,785,105
Trainable params: 17,785,105
Non-trainable params: 0
# Train the LSTM model for 5 epochs with batch size 64, using 20% of the training data for validation.
history2 = lstm_model.fit(train_data, train_y, batch_size=64, epochs=5,validation_split=0.2)
Train on 20000 samples, validate on 5000 samples
Epoch 1/5
20000/20000 [==============================] - 118s 6ms/sample - loss: 0.6880 - accuracy: 0.5357 - val_loss: 0.6775 - val_accuracy: 0.5706
Epoch 2/5
20000/20000 [==============================] - 116s 6ms/sample - loss: 0.6560 - accuracy: 0.5788 - val_loss: 0.6639 - val_accuracy: 0.5706
Epoch 3/5
20000/20000 [==============================] - 114s 6ms/sample - loss: 0.6202 - accuracy: 0.6159 - val_loss: 0.6859 - val_accuracy: 0.5394
Epoch 4/5
20000/20000 [==============================] - 115s 6ms/sample - loss: 0.4847 - accuracy: 0.7706 - val_loss: 0.4180 - val_accuracy: 0.8340
Epoch 5/5
20000/20000 [==============================] - 115s 6ms/sample - loss: 0.2744 - accuracy: 0.8967 - val_loss: 0.3724 - val_accuracy: 0.8492
Wall time: 9min 39s
# Evaluate the trained LSTM model on the test split in batches of 64.
lstm_model.evaluate(test_data, test_y, batch_size=64,verbose=2)
25000/1 - 39s - loss: 0.3878 - accuracy: 0.8455
Wall time: 38.6 s

[0.3856367552185059, 0.84552]


def gru_model():
    """Build and compile a GRU binary classifier.

    Architecture: an Embedding layer (``vocab_size`` ids -> 200-dimensional
    vectors, sequences of length ``maxlen``), a 64-unit GRU that returns only
    its final output (``return_sequences=False``), and a single sigmoid unit.

    Returns:
        A compiled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=200, input_length=maxlen),
        tf.keras.layers.GRU(64, return_sequences=False),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])
    # fit() requires a compiled model.  The loss and metric match the
    # ``loss`` / ``accuracy`` columns of the training log; the optimizer is an
    # assumption -- TODO confirm against the original notebook.
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


# NOTE: this rebinds ``gru_model`` from the builder function to the model
# instance, so the builder cannot be called again afterwards.
gru_model = gru_model()
Model: "sequential_16"
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 256, 200)          17717200  
gru_1 (GRU)                  (None, 64)                51072     
dense_6 (Dense)              (None, 1)                 65        
Total params: 17,768,337
Trainable params: 17,768,337
Non-trainable params: 0
# Train the GRU model for 5 epochs with batch size 64, using 20% of the training data for validation.
history3 = gru_model.fit(train_data, train_y, batch_size=64, epochs=5,validation_split=0.2)
Train on 20000 samples, validate on 5000 samples
Epoch 1/5
20000/20000 [==============================] - 109s 5ms/sample - loss: 0.6930 - accuracy: 0.5092 - val_loss: 0.6911 - val_accuracy: 0.5252
Epoch 2/5
20000/20000 [==============================] - 108s 5ms/sample - loss: 0.6596 - accuracy: 0.5753 - val_loss: 0.7330 - val_accuracy: 0.5642
Epoch 3/5
20000/20000 [==============================] - 108s 5ms/sample - loss: 0.5531 - accuracy: 0.7174 - val_loss: 0.5063 - val_accuracy: 0.7864
Epoch 4/5
20000/20000 [==============================] - 107s 5ms/sample - loss: 0.5976 - accuracy: 0.6156 - val_loss: 0.7025 - val_accuracy: 0.5920
Epoch 5/5
20000/20000 [==============================] - 107s 5ms/sample - loss: 0.3911 - accuracy: 0.8382 - val_loss: 0.5317 - val_accuracy: 0.7678
Wall time: 8min 58s
# Evaluate the trained GRU model on the test split in batches of 64.
gru_model.evaluate(test_data, test_y, batch_size=64,verbose=2)
25000/1 - 36s - loss: 0.4674 - accuracy: 0.7724
Wall time: 36.2 s
[0.529171141052246, 0.77236]


def bilstm_model():
    """Build and compile a bidirectional LSTM binary classifier.

    Architecture: an Embedding layer (``vocab_size`` ids -> 200-dimensional
    vectors, sequences of length ``maxlen``), a 64-unit LSTM wrapped in
    ``Bidirectional`` that returns only its final output
    (``return_sequences=False``), and a single sigmoid unit.

    Returns:
        A compiled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=200, input_length=maxlen),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=False)),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])
    # fit() requires a compiled model.  The loss and metric match the
    # ``loss`` / ``accuracy`` columns of the training log; the optimizer is an
    # assumption -- TODO confirm against the original notebook.
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


# NOTE: this rebinds ``bilstm_model`` from the builder function to the model
# instance, so the builder cannot be called again afterwards.
bilstm_model = bilstm_model()
Model: "sequential_17"
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 256, 200)          17717200  
bidirectional_22 (Bidirectio (None, 128)               135680    
dense_7 (Dense)              (None, 1)                 129       
Total params: 17,853,009
Trainable params: 17,853,009
Non-trainable params: 0
# Train the bidirectional LSTM model for 5 epochs with batch size 64, using 20% of the training data for validation.
history4 = bilstm_model.fit(train_data, train_y, batch_size=64, epochs=5,validation_split=0.2)
Train on 20000 samples, validate on 5000 samples
Epoch 1/5
20000/20000 [==============================] - 241s 12ms/sample - loss: 0.4589 - accuracy: 0.7806 - val_loss: 0.3876 - val_accuracy: 0.8356
Epoch 2/5
20000/20000 [==============================] - 239s 12ms/sample - loss: 0.2686 - accuracy: 0.8963 - val_loss: 0.3592 - val_accuracy: 0.8670
Epoch 3/5
20000/20000 [==============================] - 242s 12ms/sample - loss: 0.1854 - accuracy: 0.9333 - val_loss: 0.3503 - val_accuracy: 0.8638
Epoch 4/5
20000/20000 [==============================] - 245s 12ms/sample - loss: 0.1387 - accuracy: 0.9500 - val_loss: 0.4455 - val_accuracy: 0.8556
Epoch 5/5
20000/20000 [==============================] - 244s 12ms/sample - loss: 0.0940 - accuracy: 0.9685 - val_loss: 0.5017 - val_accuracy: 0.8486
Wall time: 20min 10s
# Evaluate the trained bidirectional LSTM model on the test split in batches of 64.
bilstm_model.evaluate(test_data, test_y, batch_size=64,verbose=2)
25000/1 - 50s - loss: 0.6497 - accuracy: 0.8325
Wall time: 50.3 s
[0.5397518084144592, 0.83252]
