# Compare two files and write the records they have in common to a result file.
import time


def common_lines(reference_lines, candidate_lines):
    """Return the candidate lines that also occur in the reference lines.

    Trailing whitespace (including the line terminator) is stripped from
    every line before comparing. Matches are returned in candidate order,
    and a candidate line that repeats is returned once per occurrence.

    Lines are compared by their full text rather than by a hand-rolled
    ``hash * 33 + ord(ch)`` key: that key is not unique ('Aa' and 'B@' both
    produce 2242), so two different records could be reported as identical.

    Args:
        reference_lines: Iterable of strings to match against.
        candidate_lines: Iterable of strings to look up.

    Returns:
        A list of the stripped candidate lines found in the reference lines.
    """
    known = {line.rstrip() for line in reference_lines}
    stripped = (raw.rstrip() for raw in candidate_lines)
    return [line for line in stripped if line in known]


def main():
    """Compare the two CSV files, write shared records, and print timing."""
    time1 = time.time()
    print(time1)
    with open('sample_submission.csv') as file_object:
        lines_1 = file_object.readlines()
    # The input is opened before the output so that a missing input file
    # does not leave behind a truncated result.csv.
    with open('sample_submission1.csv') as file_object1:
        with open('result.csv', 'w') as file_object2:
            for line in common_lines(lines_1, file_object1):
                file_object2.write(line + '\n')
    time2 = time.time()
    print(time2)
    print('比较两文件消耗时间为:' + str(time2-time1))


if __name__ == '__main__':
    main()
两文件的记录数在11万左右,其中相同的记录在7万左右。
执行结果:
1527750785.3048437
1527750790.127326
比较两文件消耗时间为:4.822482347488403
1527750811.9845114
1527750816.800993
比较两文件消耗时间为:4.816481590270996
1527750834.5217652
1527750839.1572285
比较两文件消耗时间为:4.635463237762451
# Compare two files and write the records they have in common to a result file.
import time
# Start timestamp; printed here and used at the end of the script to compute
# the elapsed time.
time1 = time.time()
print(time1)
def bin_qry(search_line, max_index, file_line):
    """Binary-search a sorted sequence for an exact match.

    Args:
        search_line: Value to look for.
        max_index: Number of leading elements of ``file_line`` to search.
        file_line: Sequence sorted in ascending order.

    Returns:
        0 if ``search_line`` is among the first ``max_index`` elements of
        ``file_line``, otherwise -1.
    """
    # Function-scope import keeps this block self-contained.
    from bisect import bisect_left

    # Clamp the bound to the sequence length so an oversized max_index
    # cannot index past the end; a non-positive bound means nothing to search.
    upper = min(max_index, len(file_line))
    if upper <= 0:
        return -1

    # bisect_left returns the leftmost insertion point; the value is present
    # only if that slot is inside the searched range and holds an equal item.
    pos = bisect_left(file_line, search_line, 0, upper)
    if pos < upper and file_line[pos] == search_line:
        return 0
    return -1
# Load the first file, strip trailing whitespace from every line, and sort
# the lines so they can be binary-searched.
with open('sample_submission.csv') as file_object:
    file_line = sorted(raw.rstrip() for raw in file_object)
sum_line = len(file_line)

# Write every line of the second file that bin_qry finds in the sorted list.
with open('sample_submission1.csv') as file_object1, \
        open('result.csv', 'w') as file_object2:
    for raw in file_object1:
        candidate = raw.rstrip()
        if bin_qry(candidate, sum_line, file_line) == 0:
            file_object2.write(candidate + '\n')

# Print the end timestamp and the seconds elapsed since time1.
time2 = time.time()
print(time2)
print(time2 - time1)
执行结果:
1527750971.1994314
1527750974.4837599
3.2843284606933594
1527750990.768388
1527750993.9247038
3.156315803527832
1527751010.618373
1527751013.8136923
3.195319414138794