/** @throws IOException */ private void readFileHeader() throws Exception { // fisrt four bytes are empty // it should be zero for new version of control file, backward compatibility int emptyBytes = this.readFourBytes(inputStream); if (emptyBytes != 0) { log.warn( "The first four bytes are not zero: " + emptyBytes + ". This is an old format control file."); this.totalClusters = emptyBytes; return; } // next four bytes should be version and greater or equal to the expected int version = this.readFourBytes(inputStream); if (version != this.EXPECTED_CONTROL_VERSION) { log.error("Unexpected version byte: " + version); throw new Exception("Unexpected version number in control file"); } // next four bytes should be the total number of clusters this.totalClusters = this.readFourBytes(inputStream); log.info("The total number of clusters: " + this.getTotalClusters()); }
/**
 * Reads the next two-byte cluster record from the control file.
 *
 * <p>Each successful read increments the current-cluster counter; when bit 1 of the record is
 * set, the control-cluster counter is incremented as well.
 *
 * <p>Record layout, as described by the original author:
 *
 * <pre>
 * Bit0:      always empty (0)
 * Bit1:      was the read identified as a control?
 * Bit2:      was the match ambiguous?
 * Bit3:      did the read match the phiX tag?
 * Bit4:      did the read align to match the phiX tag?
 * Bit5:      did the read match the control index sequence?
 *            (specified in controls.fasta, TGTCACA)
 * Bits6,7:   reserved for future use
 * Bits8..15: the report key for the matched record in the controls.fasta file
 *            (specified by the REPORT_KEY metadata)
 * </pre>
 *
 * @return an {@code Integer} holding the record masked with {@code 0x2} (so 0, or 2 when the
 *     cluster is flagged as a control), or {@code null} when the end of the stream is reached
 *     or the read fails
 */
@Override
public Object next() {
  try {
    // readUnsignedShort() never returns -1: it yields 0..65535 and reports end of stream by
    // throwing EOFException, which is handled below.
    final int record = this.inputStream.readUnsignedShort();
    this.currentCluster++;

    // Keep only bit 1, the "identified as a control" flag.
    final int controlFlag = record & 0x2;
    if (controlFlag != 0) {
      this.currentControlClusters++;
    }
    return Integer.valueOf(controlFlag);
  } catch (java.io.EOFException eof) {
    // Normal end of data: warn instead of reporting a read error.
    log.warn(
        "There is no more cluster in Control file after cluster "
            + this.getCurrentCluster()
            + " in file "
            + this.getFileName());
    return null;
  } catch (IOException ex) {
    log.error(ex, "Problem to read control file");
    return null;
  }
}
private void read(InputStream in, String filename) throws IOException { // Pattern comma=Pattern.compile("[,]"); Pattern pipe = Pattern.compile("[\\|]"); Pattern amp = Pattern.compile("&"); out.println("insert into FILE" + SUFFIX + "(filename) values (" + quote(filename) + ");"); VcfIterator r = new VcfIterator(in); VCFHeader header = r.getHeader(); String csqColumns[] = null; VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine("CSQ"); if (infoHeader != null && this.USE_VEP) { LOG.info("parsing VEP " + infoHeader.getDescription()); final String formatStr = "Format: "; int i = infoHeader.getDescription().indexOf(formatStr); if (i != -1) { csqColumns = pipe.split(infoHeader.getDescription().substring(i + formatStr.length()).trim()); LOG.debug(Arrays.asList(csqColumns)); } else { LOG.error("Cannot parse " + infoHeader.getDescription()); } } String snpEffColumns[] = null; infoHeader = header.getInfoHeaderLine("EFF"); if (infoHeader != null && this.USE_SNPEFF) { LOG.info("parsing EFF " + infoHeader.getDescription()); final String formatStr = ".Format: '"; final String desc = infoHeader.getDescription(); int i = desc.indexOf(formatStr); if (i != -1) i = desc.indexOf('(', i + formatStr.length()); int j = desc.lastIndexOf(')'); if (i != -1 && j > i) { snpEffColumns = pipe.split(desc.substring(i + 1, j).replaceAll("[ \\[\\]()\\.]", "").trim()); LOG.info(Arrays.asList(snpEffColumns)); } else { LOG.error("Cannot parse " + infoHeader.getDescription()); } } String nmdColumns[] = null; infoHeader = header.getInfoHeaderLine("NMD"); if (infoHeader != null && this.USE_SNPEFF) { final String formatStr = " Format: '"; final String desc = infoHeader.getDescription(); int i = desc.indexOf(formatStr); int j = (i == -1 ? 
-1 : desc.lastIndexOf('\'')); if (i != -1 && j > i) { nmdColumns = pipe.split( desc.substring(i + formatStr.length(), j).replaceAll("[ \\[\\]()\\.]", "").trim()); } else { LOG.error("Cannot parse " + infoHeader.getDescription()); } } String lofColumns[] = null; infoHeader = header.getInfoHeaderLine("LOF"); if (infoHeader != null && this.USE_SNPEFF) { final String formatStr = " Format: '"; final String desc = infoHeader.getDescription(); int i = desc.indexOf(formatStr); int j = (i == -1 ? -1 : desc.lastIndexOf('\'')); if (i != -1 && j > i) { lofColumns = pipe.split( desc.substring(i + formatStr.length(), j).replaceAll("[ \\[\\]()\\.]", "").trim()); } else { LOG.error("Cannot parse " + infoHeader.getDescription()); } } for (String S : header.getSampleNamesInOrder()) { // merge into SAMPLE using (select 1+MAX(id),'azdazd' from SAMPLE) as vals(x,y) on // SAMPLE.name=vals.y when NOT MATCHED THEN INSERT VALUES vals.x,vals.y; switch (this.engine) { case hsql: out.println( "merge into SAMPLE" + SUFFIX + " using ( values(" + quote(S) + ") ) " + "AS vals(y) ON SAMPLE" + SUFFIX + ".name = vals.y " + "WHEN NOT MATCHED THEN INSERT VALUES (NULL,vals.y);"); break; default: out.println( "insert or ignore into SAMPLE" + SUFFIX + "(name) values (" + quote(S) + ");"); break; } } List<String> headers = new ArrayList<String>(); for (VCFHeaderLine line : header.getMetaDataInSortedOrder()) { if (VCFHeaderVersion.isFormatString(line.getKey())) continue; headers.add(VCFHeader.METADATA_INDICATOR + line); } String chromLine = VCFHeader.HEADER_INDICATOR; for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) { if (!VCFHeader.HEADER_INDICATOR.equals(chromLine)) chromLine += (VCFConstants.FIELD_SEPARATOR); chromLine += (field); } if (header.hasGenotypingData()) { chromLine += VCFConstants.FIELD_SEPARATOR + "FORMAT"; for (String sample : header.getGenotypeSamples()) { chromLine += VCFConstants.FIELD_SEPARATOR; chromLine += sample; } } headers.add(chromLine); for (String line : headers) { 
out.println( "insert into HEADER" + SUFFIX + "(file_id,header) values (" + "(select max(id) from FILE" + SUFFIX + ")," + quote(line) + ");"); } while (r.hasNext()) { VariantContext var = r.next(); if (var == null) { LOG.error("Cannot parse VCF"); continue; } // "create table if not exists FILE(id,filename text)"; // "create table if not exists VARIATION(id,file_id,chrom,pos,start0,end0,rs_id,ref,qual)"; out.println( "insert into VARIATION" + SUFFIX + "(file_id,chrom,pos,START0,END0,rs_id,ref,qual) values (" + "(select max(id) from FILE" + SUFFIX + ")," + quote(var.getChr()) + "," + var.getStart() + "," + (var.getStart() - 1) + "," + var.getEnd() + "," + (var.getID() == null || var.getID().equals(VCFConstants.EMPTY_ID_FIELD) ? "NULL" : quote(var.getID())) + "," + quote(var.getReference().getDisplayString()) + "," + (var.getPhredScaledQual() < 0 ? "NULL" : var.getPhredScaledQual()) + ");"); // "create table if not exists ALT(id,var_id,alt)"; for (Allele alt : var.getAlternateAlleles()) { out.println( "insert into ALT" + SUFFIX + "(var_id,alt) values (" + "(select max(id) from VARIATION" + SUFFIX + ")," + quote(alt.getDisplayString()) + ");"); } // "create table if not exists FILTER(id,var_id,filter)"; for (String filter : var.getFilters()) { out.println( "insert into FILTER" + SUFFIX + "(var_id,filter) values (" + "(select max(id) from VARIATION" + SUFFIX + ")," + quote(filter) + ");"); } CommonInfo infos = var.getCommonInfo(); for (String key : infos.getAttributes().keySet()) { Object val = infos.getAttribute(key); // "create table if not exists INFO(id,var_id,k,v)"; if (SPLIT4 && key.equals("DP4")) { String dp4[] = infotoString(val).split("[,]"); insertIntoInfo(quote(key + "[refFor]"), quote(dp4[0])); insertIntoInfo(quote(key + "[refRev]"), quote(dp4[1])); insertIntoInfo(quote(key + "[altFor]"), quote(dp4[2])); insertIntoInfo(quote(key + "[altRev]"), quote(dp4[3])); } else { insertIntoInfo(quote(key), quote(infotoString(val))); } if (key.equals("CSQ") && csqColumns 
!= null) { List as_array = castToStringArray(val); for (Object csqs : as_array) { if (csqs.toString().isEmpty()) continue; String tokens[] = pipe.split(csqs.toString()); List<String> extraInfo = new ArrayList<String>(); for (int t = 0; t < tokens.length && t < csqColumns.length; ++t) { if (tokens[t].isEmpty()) continue; if (csqColumns[t].equals("Consequence")) { for (String pred : amp.split(tokens[t])) { if (pred.isEmpty()) continue; extraInfo.add(csqColumns[t]); extraInfo.add(pred); } } else { extraInfo.add(csqColumns[t]); extraInfo.add(tokens[t]); } } insertExtraInfos("CSQ", extraInfo); } } if (key.equals("EFF") && snpEffColumns != null) { for (Object item : castToStringArray(val)) { String snpeff = item.toString(); if (snpeff.isEmpty()) continue; int opar = snpeff.indexOf('('); if (opar == -1) continue; int cpar = snpeff.lastIndexOf(')'); if (cpar == -1) continue; String tokens[] = pipe.split(snpeff.substring(opar + 1, cpar)); List<String> h = new ArrayList<String>(); h.add("Effect"); h.add(snpeff.substring(0, opar)); for (int t = 0; t < tokens.length && t < snpEffColumns.length; ++t) { if (tokens[t].isEmpty()) continue; h.add(snpEffColumns[t]); h.add(tokens[t]); } insertExtraInfos(key, h); } } if (key.equals("NMD") && nmdColumns != null) { for (Object item : castToStringArray(val)) { String nmd = item.toString(); if (nmd.isEmpty()) continue; String tokens[] = pipe.split(nmd); List<String> h = new ArrayList<String>(nmdColumns.length * 2); for (int t = 0; t < tokens.length && t < nmdColumns.length; ++t) { if (tokens[t].isEmpty()) continue; h.add(nmdColumns[t]); h.add(tokens[t]); } insertExtraInfos(key, h); } } if (key.equals("LOF") && lofColumns != null) { for (Object item : castToStringArray(val)) { String lof = item.toString(); if (lof.isEmpty()) continue; String tokens[] = pipe.split(lof); List<String> h = new ArrayList<String>(lofColumns.length * 2); for (int t = 0; t < tokens.length && t < lofColumns.length; ++t) { if (tokens[t].isEmpty()) continue; 
h.add(lofColumns[t]); h.add(tokens[t]); } insertExtraInfos(key, h); } } } GenotypesContext genotypesCtx = var.getGenotypes(); for (Genotype g : genotypesCtx) { // "create table if not exists GENOTYPE(id,var_id,k,v)"; List<Allele> alleles = g.getAlleles(); out.println( "insert into GENOTYPE" + SUFFIX + "(var_id,sample_id,A1,A2,dp,ad,gq,pl," + "is_phased,is_hom,is_homref,is_homvar,is_mixed," + "is_nocall,is_noninformative,is_available,is_called,is_filtered" + ") values (" + "(select max(id) from VARIATION" + SUFFIX + ")," + "(select id from SAMPLE" + SUFFIX + " where name=" + quote(g.getSampleName()) + ")," + (alleles.size() == 2 ? quote(alleles.get(0).getBaseString()) : "NULL") + "," + (alleles.size() == 2 ? quote(alleles.get(1).getBaseString()) : "NULL") + "," + (g.hasDP() ? g.getDP() : "NULL") + "," + (g.hasAD() ? quote(infotoString(g.getAD())) : "NULL") + "," + (g.hasGQ() ? g.getGQ() : "NULL") + "," + (g.hasPL() ? quote(infotoString(g.getPL())) : "NULL") + "," + (g.isPhased() ? 1 : 0) + "," + (g.isHom() ? 1 : 0) + "," + (g.isHomRef() ? 1 : 0) + "," + (g.isHomVar() ? 1 : 0) + "," + (g.isMixed() ? 1 : 0) + "," + (g.isNoCall() ? 1 : 0) + "," + (g.isNonInformative() ? 1 : 0) + "," + (g.isAvailable() ? 1 : 0) + "," + (g.isCalled() ? 1 : 0) + "," + (g.isFiltered() ? 1 : 0) + ");"); for (String key : g.getExtendedAttributes().keySet()) { Object val = g.getExtendedAttribute(key); if (val == null) continue; out.println( "insert into GTPROP" + SUFFIX + "(genotype_id,k,v) values (" + "(select max(id) from GENOTYPE" + SUFFIX + ")," + quote(key) + "," + quote(infotoString(val)) + ");"); } } } r.close(); }
/**
 * Prints the complete SQL script to {@code out}: the table definitions, the rows for every
 * input VCF, and optionally the indexes.
 *
 * <p>The engine is resolved from {@code ENGINE} first. For sqlite, the inserts and index
 * statements are wrapped between {@code begin transaction;} and {@code commit;}. When
 * {@code IN} is empty the VCF is read from standard input; otherwise every file in {@code IN}
 * is opened, read and closed in turn.
 *
 * @return 0 on success; -1 when {@code ENGINE} does not name a known SQL engine or when an
 *     {@code IOException} occurs
 */
@Override
protected int doWork() {
  try {
    try {
      this.engine = SQLEngine.valueOf(this.ENGINE);
    } catch (Exception err) {
      LOG.error("BAD SQL ENGINE " + this.ENGINE);
      return -1;
    }

    // --- Schema. Child tables reference their parent with ON DELETE CASCADE.
    out.println(
        "create table if not exists FILE"
            + SUFFIX
            + "("
            + columnId()
            + "filename "
            + varchar(255)
            + " NOT NULL"
            + ");");
    out.println(
        "create table if not exists HEADER"
            + SUFFIX
            + "("
            + columnId()
            + "file_id INT NOT NULL REFERENCES FILE"
            + SUFFIX
            + "(id) ON DELETE CASCADE,"
            + "header "
            + text()
            + ");");
    out.println(
        "create table if not exists SAMPLE"
            + SUFFIX
            + "("
            + columnId()
            + "name "
            + varchar(100)
            + " NOT NULL UNIQUE"
            + ");");
    out.println(
        "create table if not exists VARIATION"
            + SUFFIX
            + "("
            + columnId()
            + "file_id INT NOT NULL REFERENCES FILE"
            + SUFFIX
            + "(id) ON DELETE CASCADE,"
            + "CHROM VARCHAR(20) NOT NULL,"
            + "POS INT NOT NULL,"
            + "START0 INT NOT NULL,"
            + "END0 INT NOT NULL,"
            + "RS_ID VARCHAR(50),"
            + "REF "
            + text()
            + " NOT NULL,"
            + "QUAL FLOAT"
            + ");");
    out.println(
        "create table if not exists ALT"
            + SUFFIX
            + "("
            + columnId()
            + "var_id INT NOT NULL REFERENCES VARIATION"
            + SUFFIX
            + "(id) ON DELETE CASCADE,"
            + "ALT "
            + text()
            + ");");
    out.println(
        "create table if not exists FILTER"
            + SUFFIX
            + "("
            + columnId()
            + "var_id INT NOT NULL REFERENCES VARIATION"
            + SUFFIX
            + "(id) ON DELETE CASCADE,"
            + "FILTER varchar(50) not null"
            + ");");
    out.println(
        "create table if not exists INFO"
            + SUFFIX
            + "("
            + columnId()
            + "var_id INT NOT NULL REFERENCES VARIATION"
            + SUFFIX
            + "(id) ON DELETE CASCADE,"
            + "k varchar(50) not null,"
            + "v "
            + text()
            + " not null"
            + ");");
    out.println(
        "create table if not exists EXTRAINFO"
            + SUFFIX
            + "("
            + columnId()
            + "info_id INT NOT NULL REFERENCES INFO"
            + SUFFIX
            + "(id) ON DELETE CASCADE,"
            + "type varchar(50) not null"
            + ");");
    out.println(
        "create table if not exists EXTRAINFOPROP"
            + SUFFIX
            + "("
            + columnId()
            + "extrainfo_id INT NOT NULL REFERENCES EXTRAINFO"
            + SUFFIX
            + "(id) ON DELETE CASCADE,"
            + "k varchar(50) not null,"
            + "v "
            + text()
            + " not null"
            + ");");
    out.println(
        "create table if not exists GENOTYPE"
            + SUFFIX
            + "("
            + columnId()
            + "var_id INT NOT NULL REFERENCES VARIATION"
            + SUFFIX
            + "(id) ON DELETE CASCADE,"
            + "sample_id INT NOT NULL REFERENCES SAMPLE"
            + SUFFIX
            + "(id) ON DELETE CASCADE,"
            + "A1 "
            + text()
            + ", A2 "
            + text()
            + ", dp int, ad varchar(50), gq float,pl "
            + text()
            + ","
            + "is_phased SMALLINT not null,is_hom SMALLINT not null,is_homref SMALLINT not null,is_homvar SMALLINT not null,is_mixed SMALLINT not null,"
            + "is_nocall SMALLINT not null,is_noninformative SMALLINT not null,is_available SMALLINT not null,is_called SMALLINT not null,is_filtered SMALLINT not null"
            + ");");
    out.println(
        "create table if not exists GTPROP"
            + SUFFIX
            + "("
            + columnId()
            + "genotype_id INT NOT NULL REFERENCES GENOTYPE"
            + SUFFIX
            + "(id) ON DELETE CASCADE,"
            + "k varchar(50) not null,"
            + "v "
            + text()
            + " not null"
            + ");");

    if (this.engine == SQLEngine.sqlite) {
      out.println("begin transaction;");
    }

    // --- Data.
    if (IN.isEmpty()) {
      LOG.info("reading from stdin");
      read(System.in, "<stdin>");
    } else {
      for (File input : IN) {
        LOG.info("opening " + input);
        final InputStream in = IOUtils.openFileForReading(input);
        try {
          read(in, input.toString());
        } finally {
          // Close the file even when read() fails, so a bad input does not leak the stream.
          in.close();
        }
      }
    }

    // --- Indexes.
    if (SQLINDEX) {
      index("SAMPLE", "name");
      index("EXTRAINFO", "type");
      index("EXTRAINFOPROP", "k");
      index("EXTRAINFOPROP", "v");
      index("INFO", "var_id");
      index("INFO", "k");
      index("EXTRAINFO", "info_id");
      index("EXTRAINFOPROP", "extrainfo_id");
      index("GENOTYPE", "var_id");
      index("GENOTYPE", "sample_id");
    }

    if (this.engine == SQLEngine.sqlite) {
      out.println("commit;");
    }
    out.flush();
  } catch (IOException err) {
    err.printStackTrace();
    return -1;
  }
  return 0;
}