Beispiel #1
0
 public void addPredictedGene(PredictedGene gene) {
   root.addContent(gene.asJDomElement());
 }
Beispiel #2
0
  public static void main(String[] args) {

    if (args.length != 2) {
      System.out.println(
          "This program expects two parameters: \n"
              + "1. Input predicted genes XML filename \n"
              + "2. Output GFF filename\n");
    } else {

      String inFileString = args[0];
      String outFileString = args[1];

      File inFile = new File(inFileString);
      File outFile = new File(outFileString);

      try {

        BufferedWriter outBuff = new BufferedWriter(new FileWriter(outFile));

        // writing header first
        outBuff.write(GFF_HEADER + "\n");
        Date currentDate = new Date();
        outBuff.write(DATE_HEADER + currentDate.toString() + "\n");
        outBuff.write(TYPE_HEADER + "\n");

        BufferedReader reader = new BufferedReader(new FileReader(inFile));
        String tempSt;
        StringBuilder stBuilder = new StringBuilder();
        while ((tempSt = reader.readLine()) != null) {
          stBuilder.append(tempSt);
        }
        // closing input file reader
        reader.close();

        Annotation annotation = new Annotation(stBuilder.toString());

        HashMap<String, TreeSet<GffLine>> linesPerContig = new HashMap<String, TreeSet<GffLine>>();

        // -----------POTATIZING GENES----------------
        List<Element> contigsGenes =
            annotation
                .asJDomElement()
                .getChild(PredictedGenes.TAG_NAME)
                .getChildren(ContigXML.TAG_NAME);
        for (Element element : contigsGenes) {
          ContigXML contig = new ContigXML(element);
          TreeSet<GffLine> lines = new TreeSet<GffLine>();
          linesPerContig.put(contig.getId(), lines);
          List<XMLElement> genes = contig.getChildrenWith(PredictedGene.TAG_NAME);
          for (XMLElement xMLElement : genes) {
            PredictedGene gene = new PredictedGene(xMLElement.asJDomElement());

            String geneLine =
                contig.getId() + SEPARATOR + CHORIZO_GEN + SEPARATOR + GENE + SEPARATOR;
            int beginPos = gene.getStartPosition();
            int endPos = gene.getEndPosition();
            int initPos = beginPos;
            if (beginPos < endPos) {
              geneLine += beginPos + SEPARATOR + endPos + SEPARATOR;
            } else {
              geneLine += endPos + SEPARATOR + beginPos + SEPARATOR;
              initPos = endPos;
            }
            geneLine +=
                gene.getEvalue()
                    + SEPARATOR
                    + gene.getStrand()
                    + SEPARATOR
                    + "."
                    + SEPARATOR
                    + LOCUS_TAG
                    + gene.getId()
                    + ";\n";
            lines.add(new GffLine(initPos, geneLine));
            // outBuff.write(geneLine);

            String cdsLine = contig.getId() + SEPARATOR + CHORIZO_GEN + SEPARATOR + CDS + SEPARATOR;
            if (gene.getStrand().equals(PredictedGene.POSITIVE_STRAND)) {
              cdsLine +=
                  gene.getStartPosition() + SEPARATOR + (gene.getEndPosition() - 3) + SEPARATOR;
            } else {
              cdsLine +=
                  (gene.getEndPosition() - 3) + SEPARATOR + gene.getStartPosition() + SEPARATOR;
            }
            cdsLine +=
                gene.getEvalue() + SEPARATOR + gene.getStrand() + SEPARATOR + "0" + SEPARATOR;
            cdsLine +=
                LOCUS_TAG
                    + gene.getId()
                    + ";"
                    + PRODUCT
                    + gene.getProteinNames()
                    + ";"
                    + CHORIZO_INFERENCE
                    + gene.getAccession()
                    + "\n";
            // outBuff.write(cdsLine);
            lines.add(new GffLine(initPos, cdsLine));

            String startCodonLine =
                contig.getId() + SEPARATOR + CHORIZO_GEN + SEPARATOR + START_CODON + SEPARATOR;
            if (gene.getStrand().equals(PredictedGene.POSITIVE_STRAND)) {
              startCodonLine +=
                  gene.getStartPosition() + SEPARATOR + (gene.getStartPosition() + 2) + SEPARATOR;
            } else {
              startCodonLine +=
                  (gene.getStartPosition() - 2) + SEPARATOR + gene.getStartPosition() + SEPARATOR;
            }
            startCodonLine +=
                gene.getEvalue()
                    + SEPARATOR
                    + gene.getStrand()
                    + SEPARATOR
                    + "0"
                    + SEPARATOR
                    + LOCUS_TAG
                    + gene.getId()
                    + ";";
            startCodonLine +=
                PRODUCT
                    + gene.getProteinNames()
                    + ";"
                    + CHORIZO_INFERENCE
                    + gene.getAccession()
                    + "\n";
            // outBuff.write(startCodonLine);
            lines.add(new GffLine(initPos, startCodonLine));

            String stopCodonLine =
                contig.getId() + SEPARATOR + CHORIZO_GEN + SEPARATOR + STOP_CODON + SEPARATOR;
            if (gene.getStrand().equals(PredictedGene.POSITIVE_STRAND)) {
              stopCodonLine +=
                  (gene.getEndPosition() + 1) + SEPARATOR + (gene.getEndPosition() + 3) + SEPARATOR;
            } else {
              stopCodonLine +=
                  (gene.getEndPosition() - 3) + SEPARATOR + (gene.getEndPosition() - 1) + SEPARATOR;
            }
            stopCodonLine +=
                gene.getEvalue()
                    + SEPARATOR
                    + gene.getStrand()
                    + SEPARATOR
                    + "0"
                    + SEPARATOR
                    + LOCUS_TAG
                    + gene.getId()
                    + ";";
            stopCodonLine +=
                PRODUCT
                    + gene.getProteinNames()
                    + ";"
                    + CHORIZO_INFERENCE
                    + gene.getAccession()
                    + "\n";
            // outBuff.write(stopCodonLine);
            lines.add(new GffLine(initPos, stopCodonLine));
          }
        }

        // -----------POTATIZING RNAS-----------------
        List<Element> contigsRnas =
            annotation
                .asJDomElement()
                .getChild(PredictedRnas.TAG_NAME)
                .getChildren(ContigXML.TAG_NAME);
        for (Element element : contigsRnas) {
          ContigXML contig = new ContigXML(element);
          List<XMLElement> rnas = contig.getChildrenWith(PredictedRna.TAG_NAME);

          TreeSet<GffLine> lines = linesPerContig.get(contig.getId());
          if (lines == null) {
            lines = new TreeSet<GffLine>();
            linesPerContig.put(contig.getId(), lines);
          }

          for (XMLElement xMLElement : rnas) {
            PredictedRna rna = new PredictedRna(xMLElement.asJDomElement());

            String rnaLine = contig.getId() + SEPARATOR + CHORIZO_RNA + SEPARATOR + RNA + SEPARATOR;
            int beginPos = rna.getStartPosition();
            int endPos = rna.getEndPosition();
            int initPos = beginPos;
            if (beginPos < endPos) {
              rnaLine += beginPos + SEPARATOR + endPos + SEPARATOR;
            } else {
              rnaLine += endPos + SEPARATOR + beginPos + SEPARATOR;
              initPos = endPos;
            }
            rnaLine +=
                rna.getEvalue()
                    + SEPARATOR
                    + rna.getStrand()
                    + SEPARATOR
                    + "."
                    + SEPARATOR
                    + LOCUS_TAG
                    + rna.getId()
                    + ";";
            String columns[] = rna.getAnnotationUniprotId().split("\\|");
            String rnaProduct = columns[3];
            String refSeqId = columns[1];
            String positions = columns[2].substring(1);
            // ref|NC_007413|:3894075-3895562|16S ribosomal RNA| [locus_tag=Ava_R0035]
            rnaLine +=
                PRODUCT + rnaProduct + "," + "rna:RefSeq:" + refSeqId + " " + positions + "\n";
            // outBuff.write(rnaLine);
            lines.add(new GffLine(initPos, rnaLine));
          }
        }

        Set<String> keys = linesPerContig.keySet();
        for (String key : keys) {
          TreeSet<GffLine> lines = linesPerContig.get(key);
          GffLine line = lines.pollFirst();
          while (line != null) {
            outBuff.write(line.getLine());
            line = lines.pollFirst();
          }
        }

        outBuff.close();
        System.out.println("Done!!! :D");

      } catch (Exception e) {
        e.printStackTrace();
      }
    }
  }