# NOTE(review): this chunk was pasted with all line breaks collapsed; the code
# below is the reconstruction. `tf`, `Py3`, and `reader` are defined elsewhere
# in the file (standard TensorFlow PTB-reader tutorial module).


def _read_words(filename):
    """Read *filename* and return its contents as a list of word tokens.

    Newlines are treated as word separators in both the Python 2 and
    Python 3 branches.
    """
    with tf.gfile.GFile(filename, "r") as f:
        if Py3:
            # BUG FIX: the original replaced "\n" with "" here, which glued
            # the last word of one line onto the first word of the next
            # line whenever there was no surrounding whitespace. Use " " so
            # this branch matches the Python 2 branch below.
            return f.read().replace("\n", " ").split()
        else:
            return f.read().decode("utf-8").replace("\n", " ").split()


def _build_vocab(filename):
    """Build a word -> integer-id mapping from the words in *filename*.

    Ids are assigned by descending word frequency, with ties broken
    alphabetically, so the most frequent word receives id 0.
    """
    data = _read_words(filename)
    counter = collections.Counter(data)
    # BUG FIX: the pasted version sorted by ascending count, which gave the
    # *rarest* words the smallest ids. The commented-out experiment in the
    # original shows the intended key was (-count, word); restore it.
    # Debug print() calls removed.
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*count_pairs))
    word_to_id = dict(zip(words, range(len(words))))
    return word_to_id


# NOTE(review): the two methods below take `self` — they belong to a test
# class (presumably a tf.test.TestCase subclass) whose `class` header was
# lost in the paste. Confirm against the original file and re-indent them
# into the class body.
def setUp(self):
    self._string_data = "\n".join(
        [" hello there i am",
         " rain as day",
         " want some cheesy puffs wu"])


def testPtbRawData(self):
    tmpdir = tf.test.get_temp_dir()
    for suffix in "train", "valid", "test":
        filename = os.path.join(tmpdir, "ptb.%s.txt" % suffix)
        with tf.gfile.GFile(filename, "w") as fh:
            fh.write(self._string_data)
    # Smoke test
    output = reader.ptb_raw_data(tmpdir)


# ---------------------------------------------------------------------------
# Captured debug output from a previous run (translated marker: "print
# results"); kept for reference. Note the stray trailing spaces in 'am ' and
# 'day ' — an artifact of the buggy "\n" -> "" replacement fixed above.
#
# data Tensor("PTBProducer_1/Reshape:0", shape=(3, ?), dtype=int32)
# data: ['hello', 'there', 'i', 'am ', 'rain', 'as', 'day ', 'want', 'some',
#        'cheesy', 'puffs', 'wu']
# counter: Counter({'there': 1, 'wu': 1, 'as': 1, 'hello': 1, 'puffs': 1,
#                   'am ': 1, 'cheesy': 1, 'day ': 1, 'some': 1, 'i': 1,
#                   'rain': 1, 'want': 1})
# count_pairs: [('am ', 1), ('as', 1), ('cheesy', 1), ('day ', 1),
#               ('hello', 1), ('i', 1), ('puffs', 1), ('rain', 1),
#               ('some', 1), ('there', 1), ('want', 1), ('wu', 1)]
# words: ('am ', 'as', 'cheesy', 'day ', 'hello', 'i', 'puffs', 'rain',
#         'some', 'there', 'want', 'wu')
# word_to_id: {'want': 10, 'there': 9, 'day ': 3, 'wu': 11, 'hello': 4,
#              'puffs': 6, 'am ': 0, 'cheesy': 2, 'rain': 7, 'some': 8,
#              'i': 5, 'as': 1}
# ---------------------------------------------------------------------------