/**
 * Builds a parser for VEP predictions by reading the column layout declared in the VCF header.
 *
 * <p>The description of the INFO header line named {@code tag} is searched for the marker
 * {@code " Format:"}; everything after it is stripped of spaces, quotes, dots and parentheses
 * and split with the {@code pipe} pattern. Each non-empty token that matches a {@link COLS}
 * constant (case-insensitively) is recorded in {@code col2col} together with its position in
 * the pipe-delimited list.
 *
 * <p>If the header line, its description, or the format marker is missing, a message is
 * printed on {@code System.err} and the column map is left untouched.
 *
 * @param header the VCF header to inspect
 * @param tag the ID of the INFO header line describing the predictions
 */
public VepPredictionParser(VCFHeader header, String tag) {
  this.tag = tag;
  VCFInfoHeaderLine info = header.getInfoHeaderLine(tag);
  if (info == null || info.getDescription() == null) {
    System.err.println("NO " + tag + " found in header");
    return;
  }
  String description = info.getDescription();
  String chunck = " Format:";
  int formatIndex = description.indexOf(chunck);
  if (formatIndex == -1) {
    System.err.println("Cannot find " + chunck + " in " + description);
    return;
  }
  description =
      description
          .substring(formatIndex + chunck.length())
          .replaceAll("[ \'\\.\\(\\)]+", "")
          .trim();
  String[] tokens = pipe.split(description);
  for (int i = 0; i < tokens.length; ++i) {
    if (tokens[i].isEmpty()) {
      continue;
    }
    COLS col = null;
    for (COLS c : COLS.values()) {
      if (c.name().equalsIgnoreCase(tokens[i])) {
        col = c;
        break;
      }
    }
    if (col == null) {
      // Unknown column: report it and skip it. Registering it would insert a null key
      // into col2col (or throw, depending on the map implementation).
      System.err.println("Undefined VEP tag " + tokens[i]);
      continue;
    }
    col2col.put(col, i);
  }
}
private void read(InputStream in, String filename) throws IOException { // Pattern comma=Pattern.compile("[,]"); Pattern pipe = Pattern.compile("[\\|]"); Pattern amp = Pattern.compile("&"); out.println("insert into FILE" + SUFFIX + "(filename) values (" + quote(filename) + ");"); VcfIterator r = new VcfIterator(in); VCFHeader header = r.getHeader(); String csqColumns[] = null; VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine("CSQ"); if (infoHeader != null && this.USE_VEP) { LOG.info("parsing VEP " + infoHeader.getDescription()); final String formatStr = "Format: "; int i = infoHeader.getDescription().indexOf(formatStr); if (i != -1) { csqColumns = pipe.split(infoHeader.getDescription().substring(i + formatStr.length()).trim()); LOG.debug(Arrays.asList(csqColumns)); } else { LOG.error("Cannot parse " + infoHeader.getDescription()); } } String snpEffColumns[] = null; infoHeader = header.getInfoHeaderLine("EFF"); if (infoHeader != null && this.USE_SNPEFF) { LOG.info("parsing EFF " + infoHeader.getDescription()); final String formatStr = ".Format: '"; final String desc = infoHeader.getDescription(); int i = desc.indexOf(formatStr); if (i != -1) i = desc.indexOf('(', i + formatStr.length()); int j = desc.lastIndexOf(')'); if (i != -1 && j > i) { snpEffColumns = pipe.split(desc.substring(i + 1, j).replaceAll("[ \\[\\]()\\.]", "").trim()); LOG.info(Arrays.asList(snpEffColumns)); } else { LOG.error("Cannot parse " + infoHeader.getDescription()); } } String nmdColumns[] = null; infoHeader = header.getInfoHeaderLine("NMD"); if (infoHeader != null && this.USE_SNPEFF) { final String formatStr = " Format: '"; final String desc = infoHeader.getDescription(); int i = desc.indexOf(formatStr); int j = (i == -1 ? 
-1 : desc.lastIndexOf('\'')); if (i != -1 && j > i) { nmdColumns = pipe.split( desc.substring(i + formatStr.length(), j).replaceAll("[ \\[\\]()\\.]", "").trim()); } else { LOG.error("Cannot parse " + infoHeader.getDescription()); } } String lofColumns[] = null; infoHeader = header.getInfoHeaderLine("LOF"); if (infoHeader != null && this.USE_SNPEFF) { final String formatStr = " Format: '"; final String desc = infoHeader.getDescription(); int i = desc.indexOf(formatStr); int j = (i == -1 ? -1 : desc.lastIndexOf('\'')); if (i != -1 && j > i) { lofColumns = pipe.split( desc.substring(i + formatStr.length(), j).replaceAll("[ \\[\\]()\\.]", "").trim()); } else { LOG.error("Cannot parse " + infoHeader.getDescription()); } } for (String S : header.getSampleNamesInOrder()) { // merge into SAMPLE using (select 1+MAX(id),'azdazd' from SAMPLE) as vals(x,y) on // SAMPLE.name=vals.y when NOT MATCHED THEN INSERT VALUES vals.x,vals.y; switch (this.engine) { case hsql: out.println( "merge into SAMPLE" + SUFFIX + " using ( values(" + quote(S) + ") ) " + "AS vals(y) ON SAMPLE" + SUFFIX + ".name = vals.y " + "WHEN NOT MATCHED THEN INSERT VALUES (NULL,vals.y);"); break; default: out.println( "insert or ignore into SAMPLE" + SUFFIX + "(name) values (" + quote(S) + ");"); break; } } List<String> headers = new ArrayList<String>(); for (VCFHeaderLine line : header.getMetaDataInSortedOrder()) { if (VCFHeaderVersion.isFormatString(line.getKey())) continue; headers.add(VCFHeader.METADATA_INDICATOR + line); } String chromLine = VCFHeader.HEADER_INDICATOR; for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) { if (!VCFHeader.HEADER_INDICATOR.equals(chromLine)) chromLine += (VCFConstants.FIELD_SEPARATOR); chromLine += (field); } if (header.hasGenotypingData()) { chromLine += VCFConstants.FIELD_SEPARATOR + "FORMAT"; for (String sample : header.getGenotypeSamples()) { chromLine += VCFConstants.FIELD_SEPARATOR; chromLine += sample; } } headers.add(chromLine); for (String line : headers) { 
out.println( "insert into HEADER" + SUFFIX + "(file_id,header) values (" + "(select max(id) from FILE" + SUFFIX + ")," + quote(line) + ");"); } while (r.hasNext()) { VariantContext var = r.next(); if (var == null) { LOG.error("Cannot parse VCF"); continue; } // "create table if not exists FILE(id,filename text)"; // "create table if not exists VARIATION(id,file_id,chrom,pos,start0,end0,rs_id,ref,qual)"; out.println( "insert into VARIATION" + SUFFIX + "(file_id,chrom,pos,START0,END0,rs_id,ref,qual) values (" + "(select max(id) from FILE" + SUFFIX + ")," + quote(var.getChr()) + "," + var.getStart() + "," + (var.getStart() - 1) + "," + var.getEnd() + "," + (var.getID() == null || var.getID().equals(VCFConstants.EMPTY_ID_FIELD) ? "NULL" : quote(var.getID())) + "," + quote(var.getReference().getDisplayString()) + "," + (var.getPhredScaledQual() < 0 ? "NULL" : var.getPhredScaledQual()) + ");"); // "create table if not exists ALT(id,var_id,alt)"; for (Allele alt : var.getAlternateAlleles()) { out.println( "insert into ALT" + SUFFIX + "(var_id,alt) values (" + "(select max(id) from VARIATION" + SUFFIX + ")," + quote(alt.getDisplayString()) + ");"); } // "create table if not exists FILTER(id,var_id,filter)"; for (String filter : var.getFilters()) { out.println( "insert into FILTER" + SUFFIX + "(var_id,filter) values (" + "(select max(id) from VARIATION" + SUFFIX + ")," + quote(filter) + ");"); } CommonInfo infos = var.getCommonInfo(); for (String key : infos.getAttributes().keySet()) { Object val = infos.getAttribute(key); // "create table if not exists INFO(id,var_id,k,v)"; if (SPLIT4 && key.equals("DP4")) { String dp4[] = infotoString(val).split("[,]"); insertIntoInfo(quote(key + "[refFor]"), quote(dp4[0])); insertIntoInfo(quote(key + "[refRev]"), quote(dp4[1])); insertIntoInfo(quote(key + "[altFor]"), quote(dp4[2])); insertIntoInfo(quote(key + "[altRev]"), quote(dp4[3])); } else { insertIntoInfo(quote(key), quote(infotoString(val))); } if (key.equals("CSQ") && csqColumns 
!= null) { List as_array = castToStringArray(val); for (Object csqs : as_array) { if (csqs.toString().isEmpty()) continue; String tokens[] = pipe.split(csqs.toString()); List<String> extraInfo = new ArrayList<String>(); for (int t = 0; t < tokens.length && t < csqColumns.length; ++t) { if (tokens[t].isEmpty()) continue; if (csqColumns[t].equals("Consequence")) { for (String pred : amp.split(tokens[t])) { if (pred.isEmpty()) continue; extraInfo.add(csqColumns[t]); extraInfo.add(pred); } } else { extraInfo.add(csqColumns[t]); extraInfo.add(tokens[t]); } } insertExtraInfos("CSQ", extraInfo); } } if (key.equals("EFF") && snpEffColumns != null) { for (Object item : castToStringArray(val)) { String snpeff = item.toString(); if (snpeff.isEmpty()) continue; int opar = snpeff.indexOf('('); if (opar == -1) continue; int cpar = snpeff.lastIndexOf(')'); if (cpar == -1) continue; String tokens[] = pipe.split(snpeff.substring(opar + 1, cpar)); List<String> h = new ArrayList<String>(); h.add("Effect"); h.add(snpeff.substring(0, opar)); for (int t = 0; t < tokens.length && t < snpEffColumns.length; ++t) { if (tokens[t].isEmpty()) continue; h.add(snpEffColumns[t]); h.add(tokens[t]); } insertExtraInfos(key, h); } } if (key.equals("NMD") && nmdColumns != null) { for (Object item : castToStringArray(val)) { String nmd = item.toString(); if (nmd.isEmpty()) continue; String tokens[] = pipe.split(nmd); List<String> h = new ArrayList<String>(nmdColumns.length * 2); for (int t = 0; t < tokens.length && t < nmdColumns.length; ++t) { if (tokens[t].isEmpty()) continue; h.add(nmdColumns[t]); h.add(tokens[t]); } insertExtraInfos(key, h); } } if (key.equals("LOF") && lofColumns != null) { for (Object item : castToStringArray(val)) { String lof = item.toString(); if (lof.isEmpty()) continue; String tokens[] = pipe.split(lof); List<String> h = new ArrayList<String>(lofColumns.length * 2); for (int t = 0; t < tokens.length && t < lofColumns.length; ++t) { if (tokens[t].isEmpty()) continue; 
h.add(lofColumns[t]); h.add(tokens[t]); } insertExtraInfos(key, h); } } } GenotypesContext genotypesCtx = var.getGenotypes(); for (Genotype g : genotypesCtx) { // "create table if not exists GENOTYPE(id,var_id,k,v)"; List<Allele> alleles = g.getAlleles(); out.println( "insert into GENOTYPE" + SUFFIX + "(var_id,sample_id,A1,A2,dp,ad,gq,pl," + "is_phased,is_hom,is_homref,is_homvar,is_mixed," + "is_nocall,is_noninformative,is_available,is_called,is_filtered" + ") values (" + "(select max(id) from VARIATION" + SUFFIX + ")," + "(select id from SAMPLE" + SUFFIX + " where name=" + quote(g.getSampleName()) + ")," + (alleles.size() == 2 ? quote(alleles.get(0).getBaseString()) : "NULL") + "," + (alleles.size() == 2 ? quote(alleles.get(1).getBaseString()) : "NULL") + "," + (g.hasDP() ? g.getDP() : "NULL") + "," + (g.hasAD() ? quote(infotoString(g.getAD())) : "NULL") + "," + (g.hasGQ() ? g.getGQ() : "NULL") + "," + (g.hasPL() ? quote(infotoString(g.getPL())) : "NULL") + "," + (g.isPhased() ? 1 : 0) + "," + (g.isHom() ? 1 : 0) + "," + (g.isHomRef() ? 1 : 0) + "," + (g.isHomVar() ? 1 : 0) + "," + (g.isMixed() ? 1 : 0) + "," + (g.isNoCall() ? 1 : 0) + "," + (g.isNonInformative() ? 1 : 0) + "," + (g.isAvailable() ? 1 : 0) + "," + (g.isCalled() ? 1 : 0) + "," + (g.isFiltered() ? 1 : 0) + ");"); for (String key : g.getExtendedAttributes().keySet()) { Object val = g.getExtendedAttribute(key); if (val == null) continue; out.println( "insert into GTPROP" + SUFFIX + "(genotype_id,k,v) values (" + "(select max(id) from GENOTYPE" + SUFFIX + ")," + quote(key) + "," + quote(infotoString(val)) + ");"); } } } r.close(); }