Beispiel #1
0
  private void read(InputStream in, String filename) throws IOException {
    // Pattern comma=Pattern.compile("[,]");
    Pattern pipe = Pattern.compile("[\\|]");
    Pattern amp = Pattern.compile("&");

    out.println("insert into FILE" + SUFFIX + "(filename) values (" + quote(filename) + ");");
    VcfIterator r = new VcfIterator(in);

    VCFHeader header = r.getHeader();

    String csqColumns[] = null;
    VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine("CSQ");
    if (infoHeader != null && this.USE_VEP) {
      LOG.info("parsing VEP " + infoHeader.getDescription());
      final String formatStr = "Format: ";
      int i = infoHeader.getDescription().indexOf(formatStr);
      if (i != -1) {
        csqColumns =
            pipe.split(infoHeader.getDescription().substring(i + formatStr.length()).trim());
        LOG.debug(Arrays.asList(csqColumns));
      } else {
        LOG.error("Cannot parse " + infoHeader.getDescription());
      }
    }
    String snpEffColumns[] = null;
    infoHeader = header.getInfoHeaderLine("EFF");
    if (infoHeader != null && this.USE_SNPEFF) {
      LOG.info("parsing EFF " + infoHeader.getDescription());

      final String formatStr = ".Format: '";
      final String desc = infoHeader.getDescription();
      int i = desc.indexOf(formatStr);
      if (i != -1) i = desc.indexOf('(', i + formatStr.length());
      int j = desc.lastIndexOf(')');
      if (i != -1 && j > i) {
        snpEffColumns =
            pipe.split(desc.substring(i + 1, j).replaceAll("[ \\[\\]()\\.]", "").trim());
        LOG.info(Arrays.asList(snpEffColumns));
      } else {
        LOG.error("Cannot parse " + infoHeader.getDescription());
      }
    }

    String nmdColumns[] = null;
    infoHeader = header.getInfoHeaderLine("NMD");
    if (infoHeader != null && this.USE_SNPEFF) {

      final String formatStr = " Format: '";
      final String desc = infoHeader.getDescription();
      int i = desc.indexOf(formatStr);
      int j = (i == -1 ? -1 : desc.lastIndexOf('\''));

      if (i != -1 && j > i) {
        nmdColumns =
            pipe.split(
                desc.substring(i + formatStr.length(), j).replaceAll("[ \\[\\]()\\.]", "").trim());
      } else {
        LOG.error("Cannot parse " + infoHeader.getDescription());
      }
    }

    String lofColumns[] = null;
    infoHeader = header.getInfoHeaderLine("LOF");
    if (infoHeader != null && this.USE_SNPEFF) {

      final String formatStr = " Format: '";
      final String desc = infoHeader.getDescription();
      int i = desc.indexOf(formatStr);
      int j = (i == -1 ? -1 : desc.lastIndexOf('\''));

      if (i != -1 && j > i) {
        lofColumns =
            pipe.split(
                desc.substring(i + formatStr.length(), j).replaceAll("[ \\[\\]()\\.]", "").trim());
      } else {
        LOG.error("Cannot parse " + infoHeader.getDescription());
      }
    }

    for (String S : header.getSampleNamesInOrder()) {
      // merge into SAMPLE using (select 1+MAX(id),'azdazd' from SAMPLE) as vals(x,y) on
      // SAMPLE.name=vals.y when  NOT MATCHED THEN INSERT VALUES vals.x,vals.y;
      switch (this.engine) {
        case hsql:
          out.println(
              "merge into SAMPLE"
                  + SUFFIX
                  + " using ( values("
                  + quote(S)
                  + ") ) "
                  + "AS vals(y) ON SAMPLE"
                  + SUFFIX
                  + ".name = vals.y "
                  + "WHEN NOT MATCHED THEN INSERT VALUES  (NULL,vals.y);");
          break;
        default:
          out.println(
              "insert or ignore into SAMPLE" + SUFFIX + "(name) values (" + quote(S) + ");");
          break;
      }
    }

    List<String> headers = new ArrayList<String>();

    for (VCFHeaderLine line : header.getMetaDataInSortedOrder()) {
      if (VCFHeaderVersion.isFormatString(line.getKey())) continue;
      headers.add(VCFHeader.METADATA_INDICATOR + line);
    }

    String chromLine = VCFHeader.HEADER_INDICATOR;
    for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) {
      if (!VCFHeader.HEADER_INDICATOR.equals(chromLine))
        chromLine += (VCFConstants.FIELD_SEPARATOR);
      chromLine += (field);
    }

    if (header.hasGenotypingData()) {
      chromLine += VCFConstants.FIELD_SEPARATOR + "FORMAT";
      for (String sample : header.getGenotypeSamples()) {
        chromLine += VCFConstants.FIELD_SEPARATOR;
        chromLine += sample;
      }
    }
    headers.add(chromLine);

    for (String line : headers) {
      out.println(
          "insert into HEADER"
              + SUFFIX
              + "(file_id,header) values ("
              + "(select max(id) from FILE"
              + SUFFIX
              + "),"
              + quote(line)
              + ");");
    }

    while (r.hasNext()) {
      VariantContext var = r.next();

      if (var == null) {
        LOG.error("Cannot parse VCF");
        continue;
      }
      // "create table if not exists FILE(id,filename text)";
      // "create table if not exists VARIATION(id,file_id,chrom,pos,start0,end0,rs_id,ref,qual)";

      out.println(
          "insert into VARIATION"
              + SUFFIX
              + "(file_id,chrom,pos,START0,END0,rs_id,ref,qual) values ("
              + "(select max(id) from FILE"
              + SUFFIX
              + "),"
              + quote(var.getChr())
              + ","
              + var.getStart()
              + ","
              + (var.getStart() - 1)
              + ","
              + var.getEnd()
              + ","
              + (var.getID() == null || var.getID().equals(VCFConstants.EMPTY_ID_FIELD)
                  ? "NULL"
                  : quote(var.getID()))
              + ","
              + quote(var.getReference().getDisplayString())
              + ","
              + (var.getPhredScaledQual() < 0 ? "NULL" : var.getPhredScaledQual())
              + ");");
      // "create table if not exists ALT(id,var_id,alt)";

      for (Allele alt : var.getAlternateAlleles()) {
        out.println(
            "insert into ALT"
                + SUFFIX
                + "(var_id,alt) values ("
                + "(select max(id) from VARIATION"
                + SUFFIX
                + "),"
                + quote(alt.getDisplayString())
                + ");");
      }
      // "create table if not exists FILTER(id,var_id,filter)";

      for (String filter : var.getFilters()) {
        out.println(
            "insert into FILTER"
                + SUFFIX
                + "(var_id,filter) values ("
                + "(select max(id) from VARIATION"
                + SUFFIX
                + "),"
                + quote(filter)
                + ");");
      }
      CommonInfo infos = var.getCommonInfo();
      for (String key : infos.getAttributes().keySet()) {
        Object val = infos.getAttribute(key);
        // "create table if not exists INFO(id,var_id,k,v)";

        if (SPLIT4 && key.equals("DP4")) {
          String dp4[] = infotoString(val).split("[,]");
          insertIntoInfo(quote(key + "[refFor]"), quote(dp4[0]));
          insertIntoInfo(quote(key + "[refRev]"), quote(dp4[1]));
          insertIntoInfo(quote(key + "[altFor]"), quote(dp4[2]));
          insertIntoInfo(quote(key + "[altRev]"), quote(dp4[3]));
        } else {
          insertIntoInfo(quote(key), quote(infotoString(val)));
        }

        if (key.equals("CSQ") && csqColumns != null) {
          List as_array = castToStringArray(val);

          for (Object csqs : as_array) {
            if (csqs.toString().isEmpty()) continue;
            String tokens[] = pipe.split(csqs.toString());
            List<String> extraInfo = new ArrayList<String>();
            for (int t = 0; t < tokens.length && t < csqColumns.length; ++t) {
              if (tokens[t].isEmpty()) continue;
              if (csqColumns[t].equals("Consequence")) {
                for (String pred : amp.split(tokens[t])) {
                  if (pred.isEmpty()) continue;
                  extraInfo.add(csqColumns[t]);
                  extraInfo.add(pred);
                }

              } else {
                extraInfo.add(csqColumns[t]);
                extraInfo.add(tokens[t]);
              }
            }
            insertExtraInfos("CSQ", extraInfo);
          }
        }

        if (key.equals("EFF") && snpEffColumns != null) {
          for (Object item : castToStringArray(val)) {
            String snpeff = item.toString();
            if (snpeff.isEmpty()) continue;
            int opar = snpeff.indexOf('(');
            if (opar == -1) continue;
            int cpar = snpeff.lastIndexOf(')');
            if (cpar == -1) continue;
            String tokens[] = pipe.split(snpeff.substring(opar + 1, cpar));
            List<String> h = new ArrayList<String>();
            h.add("Effect");
            h.add(snpeff.substring(0, opar));
            for (int t = 0; t < tokens.length && t < snpEffColumns.length; ++t) {
              if (tokens[t].isEmpty()) continue;
              h.add(snpEffColumns[t]);
              h.add(tokens[t]);
            }
            insertExtraInfos(key, h);
          }
        }

        if (key.equals("NMD") && nmdColumns != null) {

          for (Object item : castToStringArray(val)) {
            String nmd = item.toString();
            if (nmd.isEmpty()) continue;
            String tokens[] = pipe.split(nmd);
            List<String> h = new ArrayList<String>(nmdColumns.length * 2);
            for (int t = 0; t < tokens.length && t < nmdColumns.length; ++t) {
              if (tokens[t].isEmpty()) continue;
              h.add(nmdColumns[t]);
              h.add(tokens[t]);
            }
            insertExtraInfos(key, h);
          }
        }

        if (key.equals("LOF") && lofColumns != null) {

          for (Object item : castToStringArray(val)) {
            String lof = item.toString();
            if (lof.isEmpty()) continue;
            String tokens[] = pipe.split(lof);
            List<String> h = new ArrayList<String>(lofColumns.length * 2);
            for (int t = 0; t < tokens.length && t < lofColumns.length; ++t) {
              if (tokens[t].isEmpty()) continue;
              h.add(lofColumns[t]);
              h.add(tokens[t]);
            }
            insertExtraInfos(key, h);
          }
        }
      }
      GenotypesContext genotypesCtx = var.getGenotypes();
      for (Genotype g : genotypesCtx) {
        // "create table if not exists GENOTYPE(id,var_id,k,v)";

        List<Allele> alleles = g.getAlleles();

        out.println(
            "insert into GENOTYPE"
                + SUFFIX
                + "(var_id,sample_id,A1,A2,dp,ad,gq,pl,"
                + "is_phased,is_hom,is_homref,is_homvar,is_mixed,"
                + "is_nocall,is_noninformative,is_available,is_called,is_filtered"
                + ") values ("
                + "(select max(id) from VARIATION"
                + SUFFIX
                + "),"
                + "(select id from SAMPLE"
                + SUFFIX
                + " where name="
                + quote(g.getSampleName())
                + "),"
                + (alleles.size() == 2 ? quote(alleles.get(0).getBaseString()) : "NULL")
                + ","
                + (alleles.size() == 2 ? quote(alleles.get(1).getBaseString()) : "NULL")
                + ","
                + (g.hasDP() ? g.getDP() : "NULL")
                + ","
                + (g.hasAD() ? quote(infotoString(g.getAD())) : "NULL")
                + ","
                + (g.hasGQ() ? g.getGQ() : "NULL")
                + ","
                + (g.hasPL() ? quote(infotoString(g.getPL())) : "NULL")
                + ","
                + (g.isPhased() ? 1 : 0)
                + ","
                + (g.isHom() ? 1 : 0)
                + ","
                + (g.isHomRef() ? 1 : 0)
                + ","
                + (g.isHomVar() ? 1 : 0)
                + ","
                + (g.isMixed() ? 1 : 0)
                + ","
                + (g.isNoCall() ? 1 : 0)
                + ","
                + (g.isNonInformative() ? 1 : 0)
                + ","
                + (g.isAvailable() ? 1 : 0)
                + ","
                + (g.isCalled() ? 1 : 0)
                + ","
                + (g.isFiltered() ? 1 : 0)
                + ");");

        for (String key : g.getExtendedAttributes().keySet()) {
          Object val = g.getExtendedAttribute(key);
          if (val == null) continue;
          out.println(
              "insert into GTPROP"
                  + SUFFIX
                  + "(genotype_id,k,v) values ("
                  + "(select max(id) from GENOTYPE"
                  + SUFFIX
                  + "),"
                  + quote(key)
                  + ","
                  + quote(infotoString(val))
                  + ");");
        }
      }
    }
    r.close();
  }
  public static void test2files(String answer, String out) throws IOException {
    // Create key and program response answers
    VariantPoolHeavy answer_pool = null;
    VariantPoolHeavy test_pool = null;
    try {
      test_pool = new VariantPoolHeavy(new File(out), "2", false);
      answer_pool = new VariantPoolHeavy(new File(answer), "1", false);
    } catch (IOException e) {
      e.printStackTrace();
    }

    //		String currVarAnswerKey;
    //		String currVarTestKey;

    //		Iterator<String> key_it = answer_pool.getVariantIterator();
    //		Iterator<String> test_it = test_pool.getVariantIterator();

    VariantContext answer_var;
    VariantContext test_var;

    // Make sure they have same number of variants..
    assertEquals(answer_pool.getNumVarRecords(), test_pool.getNumVarRecords());

    // Make sure they have the same number of samples.
    Assert.assertArrayEquals(answer_pool.getSamples().toArray(), test_pool.getSamples().toArray());

    // Iterate over variants in the key and check if they are equal...
    //		while (key_it.hasNext() && test_it.hasNext()) {
    while ((answer_var = answer_pool.getNextVar()) != null
        && (test_var = test_pool.getNextVar()) != null) {

      //			currVarAnswerKey = key_it.next();
      //			currVarTestKey = test_it.next();

      //			answer_var = answer_pool.getVariant(currVarAnswerKey);
      //			test_var = test_pool.getVariant(currVarTestKey);

      GenotypesContext key_genos = answer_var.getGenotypes();
      GenotypesContext test_genos = test_var.getGenotypes();

      Iterator<Genotype> key_geno_it = key_genos.iterator();
      Iterator<Genotype> test_geno_it = test_genos.iterator();

      // Iterate through the genotypes for the samples in this variant context
      while (key_geno_it.hasNext() && test_geno_it.hasNext()) {
        Genotype curr_key_geno = key_geno_it.next();
        Genotype curr_test_geno = test_geno_it.next();

        // Assert that the genotypes are the same
        Assert.assertArrayEquals(
            curr_key_geno.getAlleles().toArray(), curr_test_geno.getAlleles().toArray());
        // PRINT THE ARRAYS HERE TO DEBUG!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

        // Assert that the sample has a read depth
        //				Assert.assertTrue(curr_test_geno.hasDP());
        // Assert that the sample has a genotype quality score
        //				Assert.assertTrue(curr_test_geno.hasGQ());

      }

      // assert that they have the same reference and alternate alleles....
      Assert.assertTrue(
          "Ref-\nkey: " + answer_var.getReference() + " test: " + test_var.getReference() + "\n",
          answer_var.hasSameAllelesAs(test_var));
      Assert.assertTrue(
          "Alt-\nkey: "
              + answer_var.getAlternateAlleles().toString()
              + " test: "
              + test_var.getAlternateAlleles().toString()
              + "\n",
          answer_var.hasSameAlternateAllelesAs(test_var));
    }
  }