hive表导入外部数据

1.数据生成

生成1亿条数据并写入文件 data_gen_multi_proc_xxxx.txt(xxxx 为生成的总行数,例如 data_gen_multi_proc_100000000.txt);在8核CPU上耗时约4分钟。

# -*- coding: utf-8 -*-
import datetime
import random
from multiprocessing import Pool, cpu_count


def generate_data(num):
    """Build ``num`` rows of random test data as one string.

    Every row consists of five integers, each drawn with
    ``random.randint(1, 9999)``, separated by single spaces. Rows are
    joined with newline characters and no trailing newline is appended.

    Args:
        num: Number of rows to produce.

    Returns:
        The generated rows as a single newline-separated string; an
        empty string when no rows are produced.
    """
    rows = [
        ' '.join([str(random.randint(1, 9999)) for _ in range(5)])
        for _ in range(num)
    ]
    return '\n'.join(rows)


if __name__ == '__main__':
    # Script entry point: generate random rows in parallel worker processes
    # and write them to a text file, then report the elapsed wall-clock time.
    startTime = datetime.datetime.now()
    # Total number of rows to generate (100 million).
    num_lines = 100000000
    workers = cpu_count()
    # Split the rows across the workers and hand the remainder to the first
    # few tasks, so exactly num_lines rows are produced even when num_lines
    # is not divisible by the number of cores.
    base, extra = divmod(num_lines, workers)
    chunk_sizes = [base + 1 if i < extra else base for i in range(workers)]
    # Drop empty tasks; an empty chunk would otherwise add a blank line.
    chunk_sizes = [size for size in chunk_sizes if size]
    filename = 'data_gen_multi_proc_' + str(num_lines) + '.txt'
    with open(filename, 'w') as f:
        # A single pool, managed by the context manager so its worker
        # processes are always cleaned up.
        with Pool(processes=workers) as p:
            # imap yields chunks in task order as they complete, so each one
            # is written immediately instead of joining everything into one
            # more full-size string first. The results must be consumed
            # inside the with-block because leaving it terminates the pool.
            for index, chunk in enumerate(p.imap(generate_data, chunk_sizes)):
                if index:
                    f.write('\n')
                f.write(chunk)

    endTime = datetime.datetime.now()
    print('elapsed: %s' % (endTime - startTime))

你可能感兴趣的:(大数据,hive,hadoop,数据仓库)