Skip to content

Latest commit

 

History

History
77 lines (50 loc) · 2.32 KB

File metadata and controls

77 lines (50 loc) · 2.32 KB

Reading and Writing of Basic sequence file formats

TODO: needs more examples

FASTA

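BioJava can be used to parse large FASTA files. The example below reads the sequences in small batches (10 at a time) rather than all at once, which lets it parse a 1 GB (gzip-compressed) version of UniProt TrEMBL with standard JVM memory settings.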

    
    
    
     /** Download a large file, e.g. ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz
     * and pass in path to local location of file
     *
     * @param args
     */
        public static void main(String[] args) {

            if ( args.length < 1) {
                System.err.println("First argument needs to be path to fasta file");
                return;
            }

            File f = new File(args[0]);

            if ( ! f.exists()) {
                System.err.println("File does not exist " + args[0]);
                return;
            }

            try {

                // automatically uncompresses files using InputStreamProvider
                InputStreamProvider isp = new InputStreamProvider();
                
                InputStream inStream = isp.getInputStream(f);
                
                FastaReader<ProteinSequence, AminoAcidCompound> fastaReader = new FastaReader<ProteinSequence, AminoAcidCompound>(
                        inStream,
                        new GenericFastaHeaderParser<ProteinSequence, AminoAcidCompound>(),
                        new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
                
                LinkedHashMap<String, ProteinSequence> b;


                int nrSeq = 0;
                
                while ((b = fastaReader.process(10)) != null) {
                    for (String key : b.keySet()) {
                        nrSeq++;
                        System.out.println(nrSeq + " : " + key + " " + b.get(key));
                    }

                }
            } catch (Exception ex) {
                Logger.getLogger(ParseFastaFileDemo.class.getName()).log(Level.SEVERE, null, ex);
            }
        }

Navigation: Home | Book 1: The Core Module | Chapter 3 : Reading and Writing sequences

Prev: Chapter 2 : Basic Sequence types

Next: Chapter 4 : Translating