/** @throws IOException */
  private void readFileHeader() throws Exception {

    // fisrt four bytes are empty
    // it should be zero for new version of control file, backward compatibility
    int emptyBytes = this.readFourBytes(inputStream);
    if (emptyBytes != 0) {

      log.warn(
          "The first four bytes are not zero: "
              + emptyBytes
              + ". This is an old format control file.");
      this.totalClusters = emptyBytes;
      return;
    }

    // next four bytes should be version and greater or equal to the expected
    int version = this.readFourBytes(inputStream);
    if (version != this.EXPECTED_CONTROL_VERSION) {
      log.error("Unexpected version byte: " + version);
      throw new Exception("Unexpected version number in control file");
    }

    // next four bytes should be the total number of clusters
    this.totalClusters = this.readFourBytes(inputStream);
    log.info("The total number of clusters: " + this.getTotalClusters());
  }
  @Override
  public Object next() {

    try {
      int nextByte = this.inputStream.readUnsignedShort();

      if (nextByte == -1) {
        log.warn(
            "There is no more cluster in Control file after cluster "
                + this.getCurrentCluster()
                + " in file "
                + this.getFileName());
        return null;
      }

      this.currentCluster++;
      /*
      Bit0: always empty (0)
      Bit1: was the read identified as a control?
      Bit2: was the match ambiguous?
      Bit3: did the read match the phiX tag?
      Bit4: did the read align to match the phiX tag?
      Bit5: did the read match the control index sequence? (specified in controls.fata, TGTCACA)
      Bits6,7: reserved for future use
      Bits8..15: the report key for the matched record in the controls.fasta file (specified by the REPOControl FilesRT_ KEY metadata)
      */
      nextByte = nextByte & 0x2;
      if (nextByte != 0) {
        this.currentControlClusters++;
      }

      return new Integer(nextByte);

    } catch (IOException ex) {
      log.error(ex, "Problem to read control file");
    }

    return null;
  }
Beispiel #3
0
  private void read(InputStream in, String filename) throws IOException {
    // Pattern comma=Pattern.compile("[,]");
    Pattern pipe = Pattern.compile("[\\|]");
    Pattern amp = Pattern.compile("&");

    out.println("insert into FILE" + SUFFIX + "(filename) values (" + quote(filename) + ");");
    VcfIterator r = new VcfIterator(in);

    VCFHeader header = r.getHeader();

    String csqColumns[] = null;
    VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine("CSQ");
    if (infoHeader != null && this.USE_VEP) {
      LOG.info("parsing VEP " + infoHeader.getDescription());
      final String formatStr = "Format: ";
      int i = infoHeader.getDescription().indexOf(formatStr);
      if (i != -1) {
        csqColumns =
            pipe.split(infoHeader.getDescription().substring(i + formatStr.length()).trim());
        LOG.debug(Arrays.asList(csqColumns));
      } else {
        LOG.error("Cannot parse " + infoHeader.getDescription());
      }
    }
    String snpEffColumns[] = null;
    infoHeader = header.getInfoHeaderLine("EFF");
    if (infoHeader != null && this.USE_SNPEFF) {
      LOG.info("parsing EFF " + infoHeader.getDescription());

      final String formatStr = ".Format: '";
      final String desc = infoHeader.getDescription();
      int i = desc.indexOf(formatStr);
      if (i != -1) i = desc.indexOf('(', i + formatStr.length());
      int j = desc.lastIndexOf(')');
      if (i != -1 && j > i) {
        snpEffColumns =
            pipe.split(desc.substring(i + 1, j).replaceAll("[ \\[\\]()\\.]", "").trim());
        LOG.info(Arrays.asList(snpEffColumns));
      } else {
        LOG.error("Cannot parse " + infoHeader.getDescription());
      }
    }

    String nmdColumns[] = null;
    infoHeader = header.getInfoHeaderLine("NMD");
    if (infoHeader != null && this.USE_SNPEFF) {

      final String formatStr = " Format: '";
      final String desc = infoHeader.getDescription();
      int i = desc.indexOf(formatStr);
      int j = (i == -1 ? -1 : desc.lastIndexOf('\''));

      if (i != -1 && j > i) {
        nmdColumns =
            pipe.split(
                desc.substring(i + formatStr.length(), j).replaceAll("[ \\[\\]()\\.]", "").trim());
      } else {
        LOG.error("Cannot parse " + infoHeader.getDescription());
      }
    }

    String lofColumns[] = null;
    infoHeader = header.getInfoHeaderLine("LOF");
    if (infoHeader != null && this.USE_SNPEFF) {

      final String formatStr = " Format: '";
      final String desc = infoHeader.getDescription();
      int i = desc.indexOf(formatStr);
      int j = (i == -1 ? -1 : desc.lastIndexOf('\''));

      if (i != -1 && j > i) {
        lofColumns =
            pipe.split(
                desc.substring(i + formatStr.length(), j).replaceAll("[ \\[\\]()\\.]", "").trim());
      } else {
        LOG.error("Cannot parse " + infoHeader.getDescription());
      }
    }

    for (String S : header.getSampleNamesInOrder()) {
      // merge into SAMPLE using (select 1+MAX(id),'azdazd' from SAMPLE) as vals(x,y) on
      // SAMPLE.name=vals.y when  NOT MATCHED THEN INSERT VALUES vals.x,vals.y;
      switch (this.engine) {
        case hsql:
          out.println(
              "merge into SAMPLE"
                  + SUFFIX
                  + " using ( values("
                  + quote(S)
                  + ") ) "
                  + "AS vals(y) ON SAMPLE"
                  + SUFFIX
                  + ".name = vals.y "
                  + "WHEN NOT MATCHED THEN INSERT VALUES  (NULL,vals.y);");
          break;
        default:
          out.println(
              "insert or ignore into SAMPLE" + SUFFIX + "(name) values (" + quote(S) + ");");
          break;
      }
    }

    List<String> headers = new ArrayList<String>();

    for (VCFHeaderLine line : header.getMetaDataInSortedOrder()) {
      if (VCFHeaderVersion.isFormatString(line.getKey())) continue;
      headers.add(VCFHeader.METADATA_INDICATOR + line);
    }

    String chromLine = VCFHeader.HEADER_INDICATOR;
    for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) {
      if (!VCFHeader.HEADER_INDICATOR.equals(chromLine))
        chromLine += (VCFConstants.FIELD_SEPARATOR);
      chromLine += (field);
    }

    if (header.hasGenotypingData()) {
      chromLine += VCFConstants.FIELD_SEPARATOR + "FORMAT";
      for (String sample : header.getGenotypeSamples()) {
        chromLine += VCFConstants.FIELD_SEPARATOR;
        chromLine += sample;
      }
    }
    headers.add(chromLine);

    for (String line : headers) {
      out.println(
          "insert into HEADER"
              + SUFFIX
              + "(file_id,header) values ("
              + "(select max(id) from FILE"
              + SUFFIX
              + "),"
              + quote(line)
              + ");");
    }

    while (r.hasNext()) {
      VariantContext var = r.next();

      if (var == null) {
        LOG.error("Cannot parse VCF");
        continue;
      }
      // "create table if not exists FILE(id,filename text)";
      // "create table if not exists VARIATION(id,file_id,chrom,pos,start0,end0,rs_id,ref,qual)";

      out.println(
          "insert into VARIATION"
              + SUFFIX
              + "(file_id,chrom,pos,START0,END0,rs_id,ref,qual) values ("
              + "(select max(id) from FILE"
              + SUFFIX
              + "),"
              + quote(var.getChr())
              + ","
              + var.getStart()
              + ","
              + (var.getStart() - 1)
              + ","
              + var.getEnd()
              + ","
              + (var.getID() == null || var.getID().equals(VCFConstants.EMPTY_ID_FIELD)
                  ? "NULL"
                  : quote(var.getID()))
              + ","
              + quote(var.getReference().getDisplayString())
              + ","
              + (var.getPhredScaledQual() < 0 ? "NULL" : var.getPhredScaledQual())
              + ");");
      // "create table if not exists ALT(id,var_id,alt)";

      for (Allele alt : var.getAlternateAlleles()) {
        out.println(
            "insert into ALT"
                + SUFFIX
                + "(var_id,alt) values ("
                + "(select max(id) from VARIATION"
                + SUFFIX
                + "),"
                + quote(alt.getDisplayString())
                + ");");
      }
      // "create table if not exists FILTER(id,var_id,filter)";

      for (String filter : var.getFilters()) {
        out.println(
            "insert into FILTER"
                + SUFFIX
                + "(var_id,filter) values ("
                + "(select max(id) from VARIATION"
                + SUFFIX
                + "),"
                + quote(filter)
                + ");");
      }
      CommonInfo infos = var.getCommonInfo();
      for (String key : infos.getAttributes().keySet()) {
        Object val = infos.getAttribute(key);
        // "create table if not exists INFO(id,var_id,k,v)";

        if (SPLIT4 && key.equals("DP4")) {
          String dp4[] = infotoString(val).split("[,]");
          insertIntoInfo(quote(key + "[refFor]"), quote(dp4[0]));
          insertIntoInfo(quote(key + "[refRev]"), quote(dp4[1]));
          insertIntoInfo(quote(key + "[altFor]"), quote(dp4[2]));
          insertIntoInfo(quote(key + "[altRev]"), quote(dp4[3]));
        } else {
          insertIntoInfo(quote(key), quote(infotoString(val)));
        }

        if (key.equals("CSQ") && csqColumns != null) {
          List as_array = castToStringArray(val);

          for (Object csqs : as_array) {
            if (csqs.toString().isEmpty()) continue;
            String tokens[] = pipe.split(csqs.toString());
            List<String> extraInfo = new ArrayList<String>();
            for (int t = 0; t < tokens.length && t < csqColumns.length; ++t) {
              if (tokens[t].isEmpty()) continue;
              if (csqColumns[t].equals("Consequence")) {
                for (String pred : amp.split(tokens[t])) {
                  if (pred.isEmpty()) continue;
                  extraInfo.add(csqColumns[t]);
                  extraInfo.add(pred);
                }

              } else {
                extraInfo.add(csqColumns[t]);
                extraInfo.add(tokens[t]);
              }
            }
            insertExtraInfos("CSQ", extraInfo);
          }
        }

        if (key.equals("EFF") && snpEffColumns != null) {
          for (Object item : castToStringArray(val)) {
            String snpeff = item.toString();
            if (snpeff.isEmpty()) continue;
            int opar = snpeff.indexOf('(');
            if (opar == -1) continue;
            int cpar = snpeff.lastIndexOf(')');
            if (cpar == -1) continue;
            String tokens[] = pipe.split(snpeff.substring(opar + 1, cpar));
            List<String> h = new ArrayList<String>();
            h.add("Effect");
            h.add(snpeff.substring(0, opar));
            for (int t = 0; t < tokens.length && t < snpEffColumns.length; ++t) {
              if (tokens[t].isEmpty()) continue;
              h.add(snpEffColumns[t]);
              h.add(tokens[t]);
            }
            insertExtraInfos(key, h);
          }
        }

        if (key.equals("NMD") && nmdColumns != null) {

          for (Object item : castToStringArray(val)) {
            String nmd = item.toString();
            if (nmd.isEmpty()) continue;
            String tokens[] = pipe.split(nmd);
            List<String> h = new ArrayList<String>(nmdColumns.length * 2);
            for (int t = 0; t < tokens.length && t < nmdColumns.length; ++t) {
              if (tokens[t].isEmpty()) continue;
              h.add(nmdColumns[t]);
              h.add(tokens[t]);
            }
            insertExtraInfos(key, h);
          }
        }

        if (key.equals("LOF") && lofColumns != null) {

          for (Object item : castToStringArray(val)) {
            String lof = item.toString();
            if (lof.isEmpty()) continue;
            String tokens[] = pipe.split(lof);
            List<String> h = new ArrayList<String>(lofColumns.length * 2);
            for (int t = 0; t < tokens.length && t < lofColumns.length; ++t) {
              if (tokens[t].isEmpty()) continue;
              h.add(lofColumns[t]);
              h.add(tokens[t]);
            }
            insertExtraInfos(key, h);
          }
        }
      }
      GenotypesContext genotypesCtx = var.getGenotypes();
      for (Genotype g : genotypesCtx) {
        // "create table if not exists GENOTYPE(id,var_id,k,v)";

        List<Allele> alleles = g.getAlleles();

        out.println(
            "insert into GENOTYPE"
                + SUFFIX
                + "(var_id,sample_id,A1,A2,dp,ad,gq,pl,"
                + "is_phased,is_hom,is_homref,is_homvar,is_mixed,"
                + "is_nocall,is_noninformative,is_available,is_called,is_filtered"
                + ") values ("
                + "(select max(id) from VARIATION"
                + SUFFIX
                + "),"
                + "(select id from SAMPLE"
                + SUFFIX
                + " where name="
                + quote(g.getSampleName())
                + "),"
                + (alleles.size() == 2 ? quote(alleles.get(0).getBaseString()) : "NULL")
                + ","
                + (alleles.size() == 2 ? quote(alleles.get(1).getBaseString()) : "NULL")
                + ","
                + (g.hasDP() ? g.getDP() : "NULL")
                + ","
                + (g.hasAD() ? quote(infotoString(g.getAD())) : "NULL")
                + ","
                + (g.hasGQ() ? g.getGQ() : "NULL")
                + ","
                + (g.hasPL() ? quote(infotoString(g.getPL())) : "NULL")
                + ","
                + (g.isPhased() ? 1 : 0)
                + ","
                + (g.isHom() ? 1 : 0)
                + ","
                + (g.isHomRef() ? 1 : 0)
                + ","
                + (g.isHomVar() ? 1 : 0)
                + ","
                + (g.isMixed() ? 1 : 0)
                + ","
                + (g.isNoCall() ? 1 : 0)
                + ","
                + (g.isNonInformative() ? 1 : 0)
                + ","
                + (g.isAvailable() ? 1 : 0)
                + ","
                + (g.isCalled() ? 1 : 0)
                + ","
                + (g.isFiltered() ? 1 : 0)
                + ");");

        for (String key : g.getExtendedAttributes().keySet()) {
          Object val = g.getExtendedAttribute(key);
          if (val == null) continue;
          out.println(
              "insert into GTPROP"
                  + SUFFIX
                  + "(genotype_id,k,v) values ("
                  + "(select max(id) from GENOTYPE"
                  + SUFFIX
                  + "),"
                  + quote(key)
                  + ","
                  + quote(infotoString(val))
                  + ");");
        }
      }
    }
    r.close();
  }
Beispiel #4
0
  @Override
  protected int doWork() {
    try {
      try {
        this.engine = SQLEngine.valueOf(this.ENGINE);
      } catch (Exception err) {
        LOG.error("BAD SQL ENGINE " + this.ENGINE);
        return -1;
      }
      out.println(
          "create table if not exists FILE"
              + SUFFIX
              + "("
              + columnId()
              + "filename "
              + varchar(255)
              + " NOT NULL"
              + ");");

      out.println(
          "create table if not exists HEADER"
              + SUFFIX
              + "("
              + columnId()
              + "file_id INT NOT NULL REFERENCES FILE"
              + SUFFIX
              + "(id) ON DELETE CASCADE,"
              + "header "
              + text()
              + ");");

      out.println(
          "create table if not exists SAMPLE"
              + SUFFIX
              + "("
              + columnId()
              + "name "
              + varchar(100)
              + " NOT NULL UNIQUE"
              + ");");
      out.println(
          "create table if not exists VARIATION"
              + SUFFIX
              + "("
              + columnId()
              + "file_id INT NOT NULL REFERENCES FILE"
              + SUFFIX
              + "(id) ON DELETE CASCADE,"
              + "CHROM VARCHAR(20) NOT NULL,"
              + "POS INT NOT NULL,"
              + "START0 INT NOT NULL,"
              + "END0 INT NOT NULL,"
              + "RS_ID VARCHAR(50),"
              + "REF "
              + text()
              + " NOT NULL,"
              + "QUAL FLOAT"
              + ");");

      out.println(
          "create table if not exists ALT"
              + SUFFIX
              + "("
              + columnId()
              + "var_id INT NOT NULL REFERENCES VARIATION"
              + SUFFIX
              + "(id) ON DELETE CASCADE,"
              + "ALT "
              + text()
              + ");");
      out.println(
          "create table if not exists FILTER"
              + SUFFIX
              + "("
              + columnId()
              + "var_id INT NOT NULL REFERENCES VARIATION"
              + SUFFIX
              + "(id) ON DELETE CASCADE,"
              + "FILTER varchar(50) not null"
              + ");");

      out.println(
          "create table if not exists INFO"
              + SUFFIX
              + "("
              + columnId()
              + "var_id INT NOT NULL REFERENCES VARIATION"
              + SUFFIX
              + "(id) ON DELETE CASCADE,"
              + "k varchar(50) not null,"
              + "v "
              + text()
              + " not null"
              + ");");

      out.println(
          "create table if not exists EXTRAINFO"
              + SUFFIX
              + "("
              + columnId()
              + "info_id INT NOT NULL REFERENCES INFO"
              + SUFFIX
              + "(id) ON DELETE CASCADE,"
              + "type varchar(50) not null"
              + ");");

      out.println(
          "create table if not exists EXTRAINFOPROP"
              + SUFFIX
              + "("
              + columnId()
              + "extrainfo_id INT NOT NULL REFERENCES EXTRAINFO"
              + SUFFIX
              + "(id) ON DELETE CASCADE,"
              + "k varchar(50) not null,"
              + "v "
              + text()
              + " not null"
              + ");");

      out.println(
          "create table if not exists GENOTYPE"
              + SUFFIX
              + "("
              + columnId()
              + "var_id INT NOT NULL REFERENCES VARIATION"
              + SUFFIX
              + "(id) ON DELETE CASCADE,"
              + "sample_id INT NOT NULL REFERENCES SAMPLE"
              + SUFFIX
              + "(id) ON DELETE CASCADE,"
              + "A1 "
              + text()
              + ", A2 "
              + text()
              + ", dp int, ad varchar(50), gq float,pl "
              + text()
              + ","
              + "is_phased SMALLINT not null,is_hom SMALLINT not null,is_homref  SMALLINT not null,is_homvar  SMALLINT not null,is_mixed  SMALLINT not null,"
              + "is_nocall SMALLINT not null,is_noninformative SMALLINT not null,is_available SMALLINT not null,is_called SMALLINT not null,is_filtered  SMALLINT not null"
              + ");");
      out.println(
          "create table if not exists GTPROP"
              + SUFFIX
              + "("
              + columnId()
              + "genotype_id INT NOT NULL REFERENCES GENOTYPE"
              + SUFFIX
              + "(id) ON DELETE CASCADE,"
              + "k varchar(50) not null,"
              + "v "
              + text()
              + " not null"
              + ");");
      switch (this.engine) {
        case sqlite:
          out.println("begin transaction;");
          break;
        default:
          break;
      }

      if (IN.isEmpty()) {
        LOG.info("reading from stdin");
        read(System.in, "<stdin>");
      } else {
        for (File input : IN) {
          LOG.info("opening " + input);
          InputStream in = IOUtils.openFileForReading(input);
          read(in, input.toString());
          in.close();
        }
      }
      if (SQLINDEX) {
        index("SAMPLE", "name");
        index("EXTRAINFO", "type");
        index("EXTRAINFOPROP", "k");
        index("EXTRAINFOPROP", "v");

        index("INFO", "var_id");
        index("INFO", "k");
        index("EXTRAINFO", "info_id");
        index("EXTRAINFOPROP", "extrainfo_id");
        index("GENOTYPE", "var_id");
        index("GENOTYPE", "sample_id");
      }
      switch (this.engine) {
        case sqlite:
          out.println("commit;");
          break;
        default:
          break;
      }

      out.flush();
    } catch (IOException err) {
      err.printStackTrace();
      return -1;
    }
    return 0;
  }