# Compare two text files line by line and split the lines into three reports:
#   notInfirstSource.txt  - lines of srctwo.txt that are missing from srcone.txt
#   notInsecondSource.txt - lines of srcone.txt that are missing from srctwo.txt
#   matchedrecords.txt    - lines present in both files
# Lines are compared verbatim (including their line terminator) and duplicates
# within one file collapse, because each file is loaded into a set.

# Read both inputs before creating any output file, so a missing input does
# not leave empty, truncated report files behind.
with open('srcone.txt') as b:
    blines = set(b)
with open('srctwo.txt') as a:
    alines = set(a)

# Every output is managed by a context manager so the handles are closed even
# if a write fails. Sorting makes the reports reproducible: plain set
# iteration order changes from run to run.
with open('notInfirstSource.txt', 'w') as result, \
        open('matchedrecords.txt', 'w') as matched:
    result.writelines(sorted(alines - blines))
    matched.writelines(sorted(alines & blines))

with open('notInsecondSource.txt', 'w') as non:
    non.writelines(sorted(blines - alines))
notInfirstSource.txt中保存的是srctwo.txt中有、但srcone.txt中没有的行。
notInsecondSource.txt中保存的是srcone.txt中有、但srctwo.txt中没有的行。
两个文件中都存在的行保存在matchedrecords.txt中。
例子：检查bl.txt中的每条记录能否在ba.txt中找到#rsid和genotype都相同的对应记录。
import pandas as pd
def parse_change_Record(
    rawsnppath="/Users/joe/workspace/diffrentgt/ba.txt",
    fixsnppath="/Users/joe/workspace/diffrentgt/bl.txt",
):
    """Report, for each row of the fixed table, whether the raw table has it.

    Both files are read with ``pd.read_table`` as all-string tables and must
    contain the columns ``#rsid`` and ``genotype``. A row of the fixed table
    counts as found when some row of the raw table has the same ``#rsid``
    and the same ``genotype``. One line is printed per fixed-table row.

    Args:
        rawsnppath: Path of the table to search in. Defaults to the path
            that was previously hard-coded.
        fixsnppath: Path of the table whose rows are looked up. Defaults to
            the path that was previously hard-coded.

    Raises:
        KeyError: If a table lacks the ``#rsid`` or ``genotype`` column.
    """
    rawsnp = pd.read_table(rawsnppath, dtype=str)
    fixsnp = pd.read_table(fixsnppath, dtype=str)

    # Build the lookup once instead of re-scanning the raw table for every
    # fixed row. Rows with a missing value are dropped so that a missing
    # value never matches anything, as with an element-wise == comparison.
    known = rawsnp[['#rsid', 'genotype']].dropna()
    known_pairs = set(zip(known['#rsid'], known['genotype']))

    for rsid, genotype in zip(fixsnp['#rsid'], fixsnp['genotype']):
        if (rsid, genotype) in known_pairs:
            print("找到对应%s, %s" % (rsid, genotype))
        else:
            print("未找到对应%s, %s" % (rsid, genotype))
# Run the comparison immediately when this script is executed.
parse_change_Record()
import logging
import difflib
# Module-level logger named after this module; none of the code visible in
# this section logs through it.
logger = logging.getLogger(__name__)
def diff(file_path_1, file_path_2, report_html_path):
    """Write an HTML report comparing two text files line by line.

    Args:
        file_path_1: Path of the first file to compare.
        file_path_2: Path of the second file to compare.
        report_html_path: Path of the HTML file to create or overwrite.
    """
    txt_line1 = get_lines(file_path_1)
    txt_line2 = get_lines(file_path_2)

    # Render the report before opening the destination, so a failure while
    # reading or diffing does not leave a truncated, empty report behind.
    report = difflib.HtmlDiff().make_file(txt_line1, txt_line2)

    # make_file() declares utf-8 as the page charset by default, so write the
    # file in that encoding instead of the locale default. The context
    # manager closes the handle even if the write fails.
    with open(report_html_path, 'w', encoding='utf-8') as fid:
        fid.write(report)
def get_lines(file_name):
    """Return all lines of a text file as a list of strings.

    Each element keeps its line terminator. An empty file gives an empty
    list.

    Args:
        file_name: Path of the file to read.

    Returns:
        The lines of the file, in file order.
    """
    # Use a context manager so the handle is closed deterministically rather
    # than whenever the file object happens to be garbage-collected.
    with open(file_name) as handle:
        return handle.readlines()
# Hard-coded example inputs: the two .vcf files to compare and the location
# of the HTML report to produce.
file_path_1="/Users/joe/Downloads/1.vcf"
file_path_2="/Users/joe/workspace/platform/diff-file/newvcf/1.vcf"
report_html_path="/Users/joe/workspace/platform/diff-file/newvcf/1.html"
# Build the report for the two files above as soon as the script runs.
diff(file_path_1, file_path_2, report_html_path)