Bioinfo~~~
Convert a GenBank file to a FASTA file
.gb -> .fasta
/**
 * Converts GenBank flat-file records ({@code .gb}) into FASTA records ({@code .fasta}).
 *
 * <p>For every record that provides DEFINITION, ACCESSION, VERSION, ORGANISM and ORIGIN
 * sections, two lines are written to the output: a header of the form
 * {@code >Genu_spec.ACCESSION ( VERSION GI:gi ) { DEFINITION } [ ORGANISM ]} followed by the
 * upper-cased sequence on a single line. Records missing one of those sections are skipped.
 *
 * <p>Per-record parsing state is kept in instance fields, so one instance should not be used
 * from several threads at the same time.
 */
public class G2F {
    /** Number of leading characters of a header line that are treated as the keyword column. */
    private static final int KEYWORD_WIDTH = 12;
    /** Number of leading characters dropped from each sequence line before the bases. */
    private static final int SEQUENCE_PREFIX_WIDTH = 10;
    /** Maximum number of characters taken from the genus and from the species name. */
    private static final int SHORT_NAME_PART_LENGTH = 4;
    /** Species names shorter than this are replaced by {@link #SPECIES_PLACEHOLDER}. */
    private static final int MIN_SPECIES_LENGTH = 3;

    // Keywords are stored without padding and compared against the trimmed keyword column,
    // so matching does not depend on the exact amount of padding in the literal.
    private static final String ORGANISM = "ORGANISM";
    private static final String ACCESSION = "ACCESSION";
    private static final String VERSION = "VERSION";
    private static final String DEFINITION = "DEFINITION";
    private static final String ORIGIN = "ORIGIN";
    private static final String END_DATA = "//";
    private static final String GI_MARKER = "GI:";
    private static final String SPECIES_PLACEHOLDER = "XXX";

    // Which header sections of the current record have been read so far.
    private boolean organismSeen;
    private boolean accessionSeen;
    private boolean versionSeen;
    private boolean definitionSeen;

    // Values collected for the current record; cleared by resetName().
    String short_name = null, accession_name = null,
            version_name = null, definition_name = null,
            organism_name = null, warning_mess = null,
            origin = null,
            genbank_name = null,
            sequence = "",
            firstline = null, secondline = null;

    /** Marks every header section of the current record as not yet read. */
    public void resetState() {
        organismSeen = false;
        accessionSeen = false;
        versionSeen = false;
        definitionSeen = false;
    }

    /** Clears every value collected for the current record. */
    public void resetName() {
        short_name = null;
        accession_name = null;
        version_name = null;
        definition_name = null;
        organism_name = null;
        warning_mess = null;
        genbank_name = null;
        origin = null;
        sequence = "";
        firstline = null;
        secondline = null;
    }

    /**
     * Reads GenBank records from {@code inputfile} and writes one FASTA record (header line plus
     * sequence line) per complete GenBank record to {@code outputfile}.
     *
     * <p>A record ends at a line consisting of {@code //} or at end of file. Lines shorter than
     * the keyword column, a VERSION line without a {@code GI:} number, and organism names with
     * short or missing parts are all accepted instead of raising an exception.
     *
     * @param inputfile path of the GenBank file to read
     * @param outputfile path of the FASTA file to create or overwrite
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    public void scan1squence(String inputfile, String outputfile) throws IOException {
        // Start clean so that a previous, aborted run cannot leak into this one.
        resetName();
        resetState();
        try (BufferedReader in = new BufferedReader(new FileReader(inputfile));
                PrintWriter out =
                        new PrintWriter(new BufferedWriter(new FileWriter(outputfile)))) {
            String line;
            while ((line = in.readLine()) != null) {
                if (line.trim().isEmpty()) {
                    continue;
                }
                if (isRecordEnd(line)) {
                    // Record finished without an ORIGIN section: nothing to emit.
                    resetName();
                    resetState();
                    continue;
                }
                String keyword = keywordOf(line);
                if (keyword.equals(ORIGIN)) {
                    // readSequence consumes everything up to and including the record terminator.
                    sequence = readSequence(in);
                    if (organismSeen && accessionSeen && versionSeen && definitionSeen) {
                        firstline = buildHeader();
                        secondline = sequence;
                        out.println(firstline);
                        out.println(secondline);
                    }
                    // Always reset here, otherwise an incomplete record would leak its
                    // sequence and flags into the next one.
                    resetName();
                    resetState();
                } else {
                    readHeaderField(keyword, valueOf(line));
                }
            }
            // PrintWriter swallows write failures; surface them to the caller.
            if (out.checkError()) {
                throw new IOException("Failed to write FASTA output to " + outputfile);
            }
        }
    }

    /** Stores the value of a recognised header keyword; other keywords are ignored. */
    private void readHeaderField(String keyword, String value) {
        if (keyword.equals(ORGANISM)) {
            organism_name = value;
            short_name = shortNameOf(value);
            organismSeen = true;
        } else if (keyword.equals(ACCESSION)) {
            accession_name = value;
            accessionSeen = true;
        } else if (keyword.equals(VERSION)) {
            int giStart = value.indexOf(GI_MARKER);
            if (giStart >= 0) {
                version_name = value.substring(0, giStart).trim();
                genbank_name = value.substring(giStart + GI_MARKER.length()).trim();
            } else {
                // No GI number on the VERSION line.
                version_name = value.trim();
                genbank_name = "";
            }
            versionSeen = true;
        } else if (keyword.equals(DEFINITION)) {
            definition_name = value;
            definitionSeen = true;
        }
    }

    /** Reads sequence lines up to the record terminator or end of file and joins the bases. */
    private static String readSequence(BufferedReader in) throws IOException {
        StringBuilder bases = new StringBuilder();
        String line;
        while ((line = in.readLine()) != null && !isRecordEnd(line)) {
            if (line.length() > SEQUENCE_PREFIX_WIDTH) {
                bases.append(
                        line.substring(SEQUENCE_PREFIX_WIDTH).replace(" ", "").toUpperCase());
            }
        }
        return bases.toString();
    }

    /** Builds the FASTA header line from the collected fields; the GI part is omitted if empty. */
    private String buildHeader() {
        StringBuilder header = new StringBuilder(">");
        header.append(short_name).append('.').append(accession_name);
        header.append(" ( ").append(version_name);
        if (!genbank_name.isEmpty()) {
            header.append(" GI:").append(genbank_name);
        }
        header.append(" ) { ").append(definition_name);
        header.append(" } [ ").append(organism_name).append(" ]");
        return header.toString();
    }

    /** Returns "Genu_spec": up to four characters each of the first two words of the organism. */
    private static String shortNameOf(String organism) {
        String[] words = organism.trim().split("\\s+");
        String genus = words[0];
        String species = words.length > 1 ? words[1] : "";
        if (species.length() < MIN_SPECIES_LENGTH) {
            species = SPECIES_PLACEHOLDER;
        }
        return prefix(genus) + "_" + prefix(species);
    }

    /** Returns at most {@link #SHORT_NAME_PART_LENGTH} leading characters of {@code word}. */
    private static String prefix(String word) {
        return word.substring(0, Math.min(SHORT_NAME_PART_LENGTH, word.length()));
    }

    /** Returns the trimmed content of the keyword column, tolerating short lines. */
    private static String keywordOf(String line) {
        return line.substring(0, Math.min(KEYWORD_WIDTH, line.length())).trim();
    }

    /** Returns everything after the keyword column, or an empty string for short lines. */
    private static String valueOf(String line) {
        return line.length() > KEYWORD_WIDTH ? line.substring(KEYWORD_WIDTH) : "";
    }

    /** Tells whether {@code line} is the record terminator, ignoring trailing whitespace. */
    private static boolean isRecordEnd(String line) {
        return line.startsWith(END_DATA) && line.trim().equals(END_DATA);
    }
}
/**
 * Converts GenBank flat-file records ({@code .gb}) into FASTA records ({@code .fasta}).
 *
 * <p>For every record that provides DEFINITION, ACCESSION, VERSION, ORGANISM and ORIGIN
 * sections, two lines are written to the output: a header of the form
 * {@code >Genu_spec.ACCESSION ( VERSION GI:gi ) { DEFINITION } [ ORGANISM ]} followed by the
 * upper-cased sequence on a single line. Records missing one of those sections are skipped.
 *
 * <p>Per-record parsing state is kept in instance fields, so one instance should not be used
 * from several threads at the same time.
 */
public class G2F {
    /** Number of leading characters of a header line that are treated as the keyword column. */
    private static final int KEYWORD_WIDTH = 12;
    /** Number of leading characters dropped from each sequence line before the bases. */
    private static final int SEQUENCE_PREFIX_WIDTH = 10;
    /** Maximum number of characters taken from the genus and from the species name. */
    private static final int SHORT_NAME_PART_LENGTH = 4;
    /** Species names shorter than this are replaced by {@link #SPECIES_PLACEHOLDER}. */
    private static final int MIN_SPECIES_LENGTH = 3;

    // Keywords are stored without padding and compared against the trimmed keyword column,
    // so matching does not depend on the exact amount of padding in the literal.
    private static final String ORGANISM = "ORGANISM";
    private static final String ACCESSION = "ACCESSION";
    private static final String VERSION = "VERSION";
    private static final String DEFINITION = "DEFINITION";
    private static final String ORIGIN = "ORIGIN";
    private static final String END_DATA = "//";
    private static final String GI_MARKER = "GI:";
    private static final String SPECIES_PLACEHOLDER = "XXX";

    // Which header sections of the current record have been read so far.
    private boolean organismSeen;
    private boolean accessionSeen;
    private boolean versionSeen;
    private boolean definitionSeen;

    // Values collected for the current record; cleared by resetName().
    String short_name = null, accession_name = null,
            version_name = null, definition_name = null,
            organism_name = null, warning_mess = null,
            origin = null,
            genbank_name = null,
            sequence = "",
            firstline = null, secondline = null;

    /** Marks every header section of the current record as not yet read. */
    public void resetState() {
        organismSeen = false;
        accessionSeen = false;
        versionSeen = false;
        definitionSeen = false;
    }

    /** Clears every value collected for the current record. */
    public void resetName() {
        short_name = null;
        accession_name = null;
        version_name = null;
        definition_name = null;
        organism_name = null;
        warning_mess = null;
        genbank_name = null;
        origin = null;
        sequence = "";
        firstline = null;
        secondline = null;
    }

    /**
     * Reads GenBank records from {@code inputfile} and writes one FASTA record (header line plus
     * sequence line) per complete GenBank record to {@code outputfile}.
     *
     * <p>A record ends at a line consisting of {@code //} or at end of file. Lines shorter than
     * the keyword column, a VERSION line without a {@code GI:} number, and organism names with
     * short or missing parts are all accepted instead of raising an exception.
     *
     * @param inputfile path of the GenBank file to read
     * @param outputfile path of the FASTA file to create or overwrite
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    public void scan1squence(String inputfile, String outputfile) throws IOException {
        // Start clean so that a previous, aborted run cannot leak into this one.
        resetName();
        resetState();
        try (BufferedReader in = new BufferedReader(new FileReader(inputfile));
                PrintWriter out =
                        new PrintWriter(new BufferedWriter(new FileWriter(outputfile)))) {
            String line;
            while ((line = in.readLine()) != null) {
                if (line.trim().isEmpty()) {
                    continue;
                }
                if (isRecordEnd(line)) {
                    // Record finished without an ORIGIN section: nothing to emit.
                    resetName();
                    resetState();
                    continue;
                }
                String keyword = keywordOf(line);
                if (keyword.equals(ORIGIN)) {
                    // readSequence consumes everything up to and including the record terminator.
                    sequence = readSequence(in);
                    if (organismSeen && accessionSeen && versionSeen && definitionSeen) {
                        firstline = buildHeader();
                        secondline = sequence;
                        out.println(firstline);
                        out.println(secondline);
                    }
                    // Always reset here, otherwise an incomplete record would leak its
                    // sequence and flags into the next one.
                    resetName();
                    resetState();
                } else {
                    readHeaderField(keyword, valueOf(line));
                }
            }
            // PrintWriter swallows write failures; surface them to the caller.
            if (out.checkError()) {
                throw new IOException("Failed to write FASTA output to " + outputfile);
            }
        }
    }

    /** Stores the value of a recognised header keyword; other keywords are ignored. */
    private void readHeaderField(String keyword, String value) {
        if (keyword.equals(ORGANISM)) {
            organism_name = value;
            short_name = shortNameOf(value);
            organismSeen = true;
        } else if (keyword.equals(ACCESSION)) {
            accession_name = value;
            accessionSeen = true;
        } else if (keyword.equals(VERSION)) {
            int giStart = value.indexOf(GI_MARKER);
            if (giStart >= 0) {
                version_name = value.substring(0, giStart).trim();
                genbank_name = value.substring(giStart + GI_MARKER.length()).trim();
            } else {
                // No GI number on the VERSION line.
                version_name = value.trim();
                genbank_name = "";
            }
            versionSeen = true;
        } else if (keyword.equals(DEFINITION)) {
            definition_name = value;
            definitionSeen = true;
        }
    }

    /** Reads sequence lines up to the record terminator or end of file and joins the bases. */
    private static String readSequence(BufferedReader in) throws IOException {
        StringBuilder bases = new StringBuilder();
        String line;
        while ((line = in.readLine()) != null && !isRecordEnd(line)) {
            if (line.length() > SEQUENCE_PREFIX_WIDTH) {
                bases.append(
                        line.substring(SEQUENCE_PREFIX_WIDTH).replace(" ", "").toUpperCase());
            }
        }
        return bases.toString();
    }

    /** Builds the FASTA header line from the collected fields; the GI part is omitted if empty. */
    private String buildHeader() {
        StringBuilder header = new StringBuilder(">");
        header.append(short_name).append('.').append(accession_name);
        header.append(" ( ").append(version_name);
        if (!genbank_name.isEmpty()) {
            header.append(" GI:").append(genbank_name);
        }
        header.append(" ) { ").append(definition_name);
        header.append(" } [ ").append(organism_name).append(" ]");
        return header.toString();
    }

    /** Returns "Genu_spec": up to four characters each of the first two words of the organism. */
    private static String shortNameOf(String organism) {
        String[] words = organism.trim().split("\\s+");
        String genus = words[0];
        String species = words.length > 1 ? words[1] : "";
        if (species.length() < MIN_SPECIES_LENGTH) {
            species = SPECIES_PLACEHOLDER;
        }
        return prefix(genus) + "_" + prefix(species);
    }

    /** Returns at most {@link #SHORT_NAME_PART_LENGTH} leading characters of {@code word}. */
    private static String prefix(String word) {
        return word.substring(0, Math.min(SHORT_NAME_PART_LENGTH, word.length()));
    }

    /** Returns the trimmed content of the keyword column, tolerating short lines. */
    private static String keywordOf(String line) {
        return line.substring(0, Math.min(KEYWORD_WIDTH, line.length())).trim();
    }

    /** Returns everything after the keyword column, or an empty string for short lines. */
    private static String valueOf(String line) {
        return line.length() > KEYWORD_WIDTH ? line.substring(KEYWORD_WIDTH) : "";
    }

    /** Tells whether {@code line} is the record terminator, ignoring trailing whitespace. */
    private static boolean isRecordEnd(String line) {
        return line.startsWith(END_DATA) && line.trim().equals(END_DATA);
    }
}