/**
 * Builds a parser by reading the column layout from the header's INFO line named {@code tag}.
 *
 * <p>The column names are taken from the text that follows {@code " Format:"} in the INFO
 * description. Spaces, quotes, dots and parentheses are stripped, the remainder is split with
 * the {@code pipe} pattern, and each name matching a {@link COLS} constant (case-insensitive)
 * is stored in {@code col2col} together with its position in the split.
 *
 * <p>If the INFO line, its description or the format marker is missing, a message is printed to
 * stderr and the constructor returns without registering any column.
 *
 * @param header VCF header in which the INFO line is looked up
 * @param tag INFO identifier holding the predictions
 */
public VepPredictionParser(VCFHeader header, String tag) {
    this.tag = tag;
    VCFInfoHeaderLine info = header.getInfoHeaderLine(tag);
    if (info == null || info.getDescription() == null) {
      System.err.println("NO " + tag + " found in header");
      return;
    }
    String description = info.getDescription();
    final String formatMarker = " Format:";
    int i = description.indexOf(formatMarker);
    if (i == -1) {
      System.err.println("Cannot find " + formatMarker + " in " + description);
      return;
    }
    description =
        description.substring(i + formatMarker.length()).replaceAll("[ \'\\.\\(\\)]+", "").trim();
    String[] tokens = pipe.split(description);
    // 'i' is the position of the token in the split description, empty tokens included,
    // so that the stored index lines up with the fields of a split prediction.
    for (i = 0; i < tokens.length; ++i) {
      if (tokens[i].isEmpty()) continue;
      COLS col = null;
      for (COLS c : COLS.values()) {
        if (c.name().equalsIgnoreCase(tokens[i])) {
          col = c;
        }
      }
      if (col == null) {
        System.err.println("Undefined VEP tag " + tokens[i]);
        // Unknown columns are reported and skipped: registering them under a null key
        // would either pollute the map or fail, depending on the map implementation.
        continue;
      }
      col2col.put(col, i);
    }
  }
Ejemplo n.º 2
0
  /**
   * Reads a VCF stream and prints to {@code out} the SQL statements describing its content.
   *
   * <p>Statements are emitted in this order: one row in FILE, one row per sample in SAMPLE
   * (syntax chosen from {@code engine}), one HEADER row per header line, then for every variant
   * a VARIATION row followed by its ALT, FILTER, INFO-attribute, GENOTYPE and GTPROP rows.
   * INFO attributes are written through {@code insertIntoInfo}. When the matching header line
   * could be parsed and the matching option is enabled, CSQ, EFF, NMD and LOF values are also
   * broken into (column, value) pairs and handed to {@code insertExtraInfos}.
   *
   * <p>Child rows reference their parent with {@code select max(id)}, so the order of the
   * statements matters.
   *
   * @param in stream the VCF is read from
   * @param filename name stored in the FILE table
   * @throws IOException declared by the signature; presumably raised while reading or closing
   *     the VCF iterator
   */
  private void read(InputStream in, String filename) throws IOException {
    Pattern pipe = Pattern.compile("[\\|]");
    Pattern amp = Pattern.compile("&");

    out.println("insert into FILE" + SUFFIX + "(filename) values (" + quote(filename) + ");");
    VcfIterator r = new VcfIterator(in);
    // The iterator is closed in 'finally' so that it is released even when a record
    // cannot be converted.
    try {
      VCFHeader header = r.getHeader();

      // Column names of the VEP predictions, read from the CSQ header description.
      String[] csqColumns = null;
      VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine("CSQ");
      if (infoHeader != null && this.USE_VEP) {
        LOG.info("parsing VEP " + infoHeader.getDescription());
        final String formatStr = "Format: ";
        final String desc = infoHeader.getDescription();
        int i = desc.indexOf(formatStr);
        if (i != -1) {
          csqColumns = pipe.split(desc.substring(i + formatStr.length()).trim());
          LOG.debug(Arrays.asList(csqColumns));
        } else {
          LOG.error("Cannot parse " + infoHeader.getDescription());
        }
      }

      // Column names of the SnpEff effects: the text between the first '(' following the
      // format marker and the last ')' of the description.
      String[] snpEffColumns = null;
      infoHeader = header.getInfoHeaderLine("EFF");
      if (infoHeader != null && this.USE_SNPEFF) {
        LOG.info("parsing EFF " + infoHeader.getDescription());

        final String formatStr = ".Format: '";
        final String desc = infoHeader.getDescription();
        int i = desc.indexOf(formatStr);
        if (i != -1) i = desc.indexOf('(', i + formatStr.length());
        int j = desc.lastIndexOf(')');
        if (i != -1 && j > i) {
          snpEffColumns =
              pipe.split(desc.substring(i + 1, j).replaceAll("[ \\[\\]()\\.]", "").trim());
          LOG.info(Arrays.asList(snpEffColumns));
        } else {
          LOG.error("Cannot parse " + infoHeader.getDescription());
        }
      }

      // Column names of the NMD annotation: the text between the format marker and the
      // last single quote of the description.
      String[] nmdColumns = null;
      infoHeader = header.getInfoHeaderLine("NMD");
      if (infoHeader != null && this.USE_SNPEFF) {
        final String formatStr = " Format: '";
        final String desc = infoHeader.getDescription();
        int i = desc.indexOf(formatStr);
        int j = (i == -1 ? -1 : desc.lastIndexOf('\''));

        if (i != -1 && j > i) {
          nmdColumns =
              pipe.split(
                  desc.substring(i + formatStr.length(), j)
                      .replaceAll("[ \\[\\]()\\.]", "")
                      .trim());
        } else {
          LOG.error("Cannot parse " + infoHeader.getDescription());
        }
      }

      // Column names of the LOF annotation, extracted the same way as for NMD.
      String[] lofColumns = null;
      infoHeader = header.getInfoHeaderLine("LOF");
      if (infoHeader != null && this.USE_SNPEFF) {
        final String formatStr = " Format: '";
        final String desc = infoHeader.getDescription();
        int i = desc.indexOf(formatStr);
        int j = (i == -1 ? -1 : desc.lastIndexOf('\''));

        if (i != -1 && j > i) {
          lofColumns =
              pipe.split(
                  desc.substring(i + formatStr.length(), j)
                      .replaceAll("[ \\[\\]()\\.]", "")
                      .trim());
        } else {
          LOG.error("Cannot parse " + infoHeader.getDescription());
        }
      }

      // Register every sample once; the statement depends on the target SQL engine.
      for (String S : header.getSampleNamesInOrder()) {
        switch (this.engine) {
          case hsql:
            out.println(
                "merge into SAMPLE"
                    + SUFFIX
                    + " using ( values("
                    + quote(S)
                    + ") ) "
                    + "AS vals(y) ON SAMPLE"
                    + SUFFIX
                    + ".name = vals.y "
                    + "WHEN NOT MATCHED THEN INSERT VALUES  (NULL,vals.y);");
            break;
          default:
            out.println(
                "insert or ignore into SAMPLE" + SUFFIX + "(name) values (" + quote(S) + ");");
            break;
        }
      }

      // Rebuild the textual header: metadata lines first, then the column line.
      List<String> headers = new ArrayList<String>();

      for (VCFHeaderLine line : header.getMetaDataInSortedOrder()) {
        if (VCFHeaderVersion.isFormatString(line.getKey())) continue;
        headers.add(VCFHeader.METADATA_INDICATOR + line);
      }

      StringBuilder chromLine = new StringBuilder(VCFHeader.HEADER_INDICATOR);
      for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) {
        // No separator before the first field, which directly follows the indicator.
        if (chromLine.length() > VCFHeader.HEADER_INDICATOR.length()) {
          chromLine.append(VCFConstants.FIELD_SEPARATOR);
        }
        chromLine.append(field);
      }

      if (header.hasGenotypingData()) {
        chromLine.append(VCFConstants.FIELD_SEPARATOR).append("FORMAT");
        for (String sample : header.getGenotypeSamples()) {
          chromLine.append(VCFConstants.FIELD_SEPARATOR);
          chromLine.append(sample);
        }
      }
      headers.add(chromLine.toString());

      for (String line : headers) {
        out.println(
            "insert into HEADER"
                + SUFFIX
                + "(file_id,header) values ("
                + "(select max(id) from FILE"
                + SUFFIX
                + "),"
                + quote(line)
                + ");");
      }

      while (r.hasNext()) {
        VariantContext var = r.next();

        if (var == null) {
          LOG.error("Cannot parse VCF");
          continue;
        }

        // START0 is written as start-1 and END0 as the unchanged end.
        out.println(
            "insert into VARIATION"
                + SUFFIX
                + "(file_id,chrom,pos,START0,END0,rs_id,ref,qual) values ("
                + "(select max(id) from FILE"
                + SUFFIX
                + "),"
                + quote(var.getChr())
                + ","
                + var.getStart()
                + ","
                + (var.getStart() - 1)
                + ","
                + var.getEnd()
                + ","
                + (var.getID() == null || var.getID().equals(VCFConstants.EMPTY_ID_FIELD)
                    ? "NULL"
                    : quote(var.getID()))
                + ","
                + quote(var.getReference().getDisplayString())
                + ","
                + (var.getPhredScaledQual() < 0 ? "NULL" : var.getPhredScaledQual())
                + ");");

        for (Allele alt : var.getAlternateAlleles()) {
          out.println(
              "insert into ALT"
                  + SUFFIX
                  + "(var_id,alt) values ("
                  + "(select max(id) from VARIATION"
                  + SUFFIX
                  + "),"
                  + quote(alt.getDisplayString())
                  + ");");
        }

        for (String filter : var.getFilters()) {
          out.println(
              "insert into FILTER"
                  + SUFFIX
                  + "(var_id,filter) values ("
                  + "(select max(id) from VARIATION"
                  + SUFFIX
                  + "),"
                  + quote(filter)
                  + ");");
        }

        CommonInfo infos = var.getCommonInfo();
        for (String key : infos.getAttributes().keySet()) {
          Object val = infos.getAttribute(key);

          if (SPLIT4 && key.equals("DP4")) {
            String[] dp4 = infotoString(val).split("[,]");
            if (dp4.length >= 4) {
              insertIntoInfo(quote(key + "[refFor]"), quote(dp4[0]));
              insertIntoInfo(quote(key + "[refRev]"), quote(dp4[1]));
              insertIntoInfo(quote(key + "[altFor]"), quote(dp4[2]));
              insertIntoInfo(quote(key + "[altRev]"), quote(dp4[3]));
            } else {
              // A DP4 with fewer than four values cannot be split: keep it unsplit
              // instead of failing on an out-of-range index.
              LOG.error("Cannot split DP4 " + infotoString(val));
              insertIntoInfo(quote(key), quote(infotoString(val)));
            }
          } else {
            insertIntoInfo(quote(key), quote(infotoString(val)));
          }

          if (key.equals("CSQ") && csqColumns != null) {
            for (Object csqs : castToStringArray(val)) {
              if (csqs.toString().isEmpty()) continue;
              String[] tokens = pipe.split(csqs.toString());
              List<String> extraInfo = new ArrayList<String>();
              for (int t = 0; t < tokens.length && t < csqColumns.length; ++t) {
                if (tokens[t].isEmpty()) continue;
                if (csqColumns[t].equals("Consequence")) {
                  // A consequence field may hold several '&'-separated terms:
                  // one (column, value) pair is emitted for each of them.
                  for (String pred : amp.split(tokens[t])) {
                    if (pred.isEmpty()) continue;
                    extraInfo.add(csqColumns[t]);
                    extraInfo.add(pred);
                  }
                } else {
                  extraInfo.add(csqColumns[t]);
                  extraInfo.add(tokens[t]);
                }
              }
              insertExtraInfos("CSQ", extraInfo);
            }
          }

          if (key.equals("EFF") && snpEffColumns != null) {
            for (Object item : castToStringArray(val)) {
              String snpeff = item.toString();
              if (snpeff.isEmpty()) continue;
              int opar = snpeff.indexOf('(');
              if (opar == -1) continue;
              int cpar = snpeff.lastIndexOf(')');
              // Also covers a missing ')' (-1): the fields must lie between '(' and ')'.
              if (cpar < opar) continue;
              String[] tokens = pipe.split(snpeff.substring(opar + 1, cpar));
              List<String> h = new ArrayList<String>();
              // The text before '(' is stored under the "Effect" column.
              h.add("Effect");
              h.add(snpeff.substring(0, opar));
              for (int t = 0; t < tokens.length && t < snpEffColumns.length; ++t) {
                if (tokens[t].isEmpty()) continue;
                h.add(snpEffColumns[t]);
                h.add(tokens[t]);
              }
              insertExtraInfos(key, h);
            }
          }

          if (key.equals("NMD") && nmdColumns != null) {
            for (Object item : castToStringArray(val)) {
              String nmd = item.toString();
              if (nmd.isEmpty()) continue;
              String[] tokens = pipe.split(nmd);
              List<String> h = new ArrayList<String>(nmdColumns.length * 2);
              for (int t = 0; t < tokens.length && t < nmdColumns.length; ++t) {
                if (tokens[t].isEmpty()) continue;
                h.add(nmdColumns[t]);
                h.add(tokens[t]);
              }
              insertExtraInfos(key, h);
            }
          }

          if (key.equals("LOF") && lofColumns != null) {
            for (Object item : castToStringArray(val)) {
              String lof = item.toString();
              if (lof.isEmpty()) continue;
              String[] tokens = pipe.split(lof);
              List<String> h = new ArrayList<String>(lofColumns.length * 2);
              for (int t = 0; t < tokens.length && t < lofColumns.length; ++t) {
                if (tokens[t].isEmpty()) continue;
                h.add(lofColumns[t]);
                h.add(tokens[t]);
              }
              insertExtraInfos(key, h);
            }
          }
        }

        GenotypesContext genotypesCtx = var.getGenotypes();
        for (Genotype g : genotypesCtx) {
          List<Allele> alleles = g.getAlleles();

          // A1/A2 are only filled when the genotype has exactly two alleles.
          out.println(
              "insert into GENOTYPE"
                  + SUFFIX
                  + "(var_id,sample_id,A1,A2,dp,ad,gq,pl,"
                  + "is_phased,is_hom,is_homref,is_homvar,is_mixed,"
                  + "is_nocall,is_noninformative,is_available,is_called,is_filtered"
                  + ") values ("
                  + "(select max(id) from VARIATION"
                  + SUFFIX
                  + "),"
                  + "(select id from SAMPLE"
                  + SUFFIX
                  + " where name="
                  + quote(g.getSampleName())
                  + "),"
                  + (alleles.size() == 2 ? quote(alleles.get(0).getBaseString()) : "NULL")
                  + ","
                  + (alleles.size() == 2 ? quote(alleles.get(1).getBaseString()) : "NULL")
                  + ","
                  + (g.hasDP() ? g.getDP() : "NULL")
                  + ","
                  + (g.hasAD() ? quote(infotoString(g.getAD())) : "NULL")
                  + ","
                  + (g.hasGQ() ? g.getGQ() : "NULL")
                  + ","
                  + (g.hasPL() ? quote(infotoString(g.getPL())) : "NULL")
                  + ","
                  + (g.isPhased() ? 1 : 0)
                  + ","
                  + (g.isHom() ? 1 : 0)
                  + ","
                  + (g.isHomRef() ? 1 : 0)
                  + ","
                  + (g.isHomVar() ? 1 : 0)
                  + ","
                  + (g.isMixed() ? 1 : 0)
                  + ","
                  + (g.isNoCall() ? 1 : 0)
                  + ","
                  + (g.isNonInformative() ? 1 : 0)
                  + ","
                  + (g.isAvailable() ? 1 : 0)
                  + ","
                  + (g.isCalled() ? 1 : 0)
                  + ","
                  + (g.isFiltered() ? 1 : 0)
                  + ");");

          for (String key : g.getExtendedAttributes().keySet()) {
            Object val = g.getExtendedAttribute(key);
            if (val == null) continue;
            out.println(
                "insert into GTPROP"
                    + SUFFIX
                    + "(genotype_id,k,v) values ("
                    + "(select max(id) from GENOTYPE"
                    + SUFFIX
                    + "),"
                    + quote(key)
                    + ","
                    + quote(infotoString(val))
                    + ");");
          }
        }
      }
    } finally {
      r.close();
    }
  }