python代码 大数据

大数据 - 哈希

教你如何迅速秒杀掉:99%的海量数据处理面试题 http://blog.csdn.net/v_july_v/article/details/7382693

   1:  import operator
   2:  import heapq
   3:   
   4:  def hashfiles():
   5:   
   6:      files = []
   7:      for i in range(0, 10):
   8:          files.append(file(str(i) + '.txt', 'w'))
   9:   
  10:      queryfile = file('./data/queryfile.txt', 'r')
  11:      for query in queryfile:
  12:          files[hash(query)%10].write(query)
  13:   
  14:      queryfile.close()
  15:   
  16:      for f in files:
  17:          f.close()
  18:   
  19:  def sortqueriesinfiles():
  20:      files = []
  21:      for i in range(0, 10):
  22:          files.append(file(str(i) + '.txt', 'r+'))
  23:   
  24:      for f in files:
  25:          D = {}
  26:          for query in f:
  27:              query = query.strip()
  28:              if query in D:
  29:                  D[query] += 1
  30:              else:
  31:                  D[query] = 1
  32:          sorted_D = sorted(D.iteritems(), key = operator.itemgetter(1))
  33:   
  34:          f.seek(0, 0)
  35:          f.truncate()
  36:          for item in sorted_D:
  37:              f.write(item[0] + '\t' + str(item[1]) + '\n')
  38:          f.close()
  39:   
  40:  def iteratefiles(f):
  41:      for line in f:
  42:          query, count = line.split('\t', 1)
  43:          yield (-int(count), query)
  44:   
  45:  def mergefiles():
  46:      files = []
  47:      for i in range(0, 10):
  48:          files.append(file(str(i) + '.txt', 'r'))
  49:   
  50:      dest_file = file('dest.txt', 'w')
  51:   
  52:      for line in heapq.merge(*[iteratefiles(f) for f in files]):
  53:          print line
  54:          dest_file.write(line[1] + '\n')
  55:   
  56:      dest_file.close()
  57:   
  58:      for f in files:
  59:          f.close()
  60:   
  61:  if __name__ == '__main__':
  62:   
  63:      hashfiles()
  64:      sortqueriesinfiles()
  65:      mergefiles()

你可能感兴趣的:(python,import,target,title,blank)