/** * Make Document for coref (for method coref(Document doc, StringBuilder[] outputs)). Mention * detection and document preprocessing is done here. * * @throws Exception */ public Document makeDocument(InputDoc input) throws Exception { if (input == null) return null; Annotation anno = input.annotation; // add missing annotation if (needMissingAnnotations) { addMissingAnnotation(anno); } if (Boolean.parseBoolean(props.getProperty("hcoref.useMarkedDiscourse", "false"))) { anno.set(CoreAnnotations.UseMarkedDiscourseAnnotation.class, true); } // remove nested NP with same headword except newswire document for chinese if (input.conllDoc != null && CorefProperties.getLanguage(props) == Locale.CHINESE) { CorefProperties.setRemoveNested(props, !input.conllDoc.documentID.contains("nw")); } // mention detection: MD gives following information about mentions: mention start/end index, // span, headword // rest information will be set in preprocess step List<List<Mention>> mentions = md.findMentions(anno, dict, props); Document doc = new Document(input, mentions); // find headword for gold mentions if (input.goldMentions != null) findGoldMentionHeads(doc); // document preprocessing: initialization (assign ID), mention processing (gender, number, type, // etc), speaker extraction, etc Preprocessor.preprocess(doc, dict, singletonPredictor, headFinder); return doc; }
public static void main(String[] argv) throws IOException, CmdLineParser.UnknownOptionException, CmdLineParser.IllegalOptionValueException { if (argv.length < 4) { System.out.println("Usage: hictools pre <options> <inputFile> <outputFile> <genomeID>"); System.out.println(" <options>: -d only calculate intra chromosome (diagonal) [false]"); System.out.println( " : -o calculate densities (observed/expected), write to file [false]"); System.out.println(" : -t <int> only write cells with count above threshold t [0]"); System.out.println( " : -c <chromosome ID> only calculate map on specific chromosome"); System.exit(0); } Globals.setHeadless(true); CommandLineParser parser = new CommandLineParser(); parser.parse(argv); String[] args = parser.getRemainingArgs(); if (args[0].equals("sort")) { AlignmentsSorter.sort(args[1], args[2], null); } else if (args[0].equals("pairsToBin")) { String ifile = args[1]; String ofile = args[2]; String genomeId = args[3]; List<Chromosome> chromosomes = loadChromosomes(genomeId); AsciiToBinConverter.convert(ifile, ofile, chromosomes); } else if (args[0].equals("binToPairs")) { String ifile = args[1]; String ofile = args[2]; AsciiToBinConverter.convertBack(ifile, ofile); } else if (args[0].equals("printmatrix")) { if (args.length < 5) { System.err.println( "Usage: hictools printmatrix <observed/oe/pearson> hicFile chr1 chr2 binsize"); System.exit(-1); } String type = args[1]; String file = args[2]; String chr1 = args[3]; String chr2 = args[4]; String binSizeSt = args[5]; int binSize = 0; try { binSize = Integer.parseInt(binSizeSt); } catch (NumberFormatException e) { System.err.println("Integer expected. Found: " + binSizeSt); System.exit(-1); } dumpMatrix(file, chr1, chr2, binSize, type); } else if (args[0].equals("eigenvector")) { if (args.length < 4) { System.err.println("Usage: hictools eigenvector hicFile chr binsize"); } String file = args[1]; String chr = args[2]; String binSizeSt = args[3]; int binSize = 0; try { binSize = Integer.parseInt(binSizeSt); } catch (NumberFormatException e) { System.err.println("Integer expected. Found: " + binSizeSt); System.exit(-1); } calculateEigenvector(file, chr, binSize); } else if (args[0].equals("pre")) { String genomeId = ""; try { genomeId = args[3]; } catch (ArrayIndexOutOfBoundsException e) { System.err.println("No genome ID given"); System.exit(0); } List<Chromosome> chromosomes = loadChromosomes(genomeId); long genomeLength = 0; for (Chromosome c : chromosomes) { if (c != null) genomeLength += c.getSize(); } chromosomes.set(0, new Chromosome(0, "All", (int) (genomeLength / 1000))); String[] tokens = args[1].split(","); List<String> files = new ArrayList<String>(tokens.length); for (String f : tokens) { files.add(f); } Preprocessor preprocessor = new Preprocessor(new File(args[2]), chromosomes); preprocessor.setIncludedChromosomes(parser.getChromosomeOption()); preprocessor.setCountThreshold(parser.getCountThresholdOption()); preprocessor.setNumberOfThreads(parser.getThreadedOption()); preprocessor.setDiagonalsOnly(parser.getDiagonalsOption()); preprocessor.setLoadDensities(parser.getDensitiesOption()); preprocessor.preprocess(files); } }
public void toTDF( String typeString, String ifile, String ofile, String probeFile, String genomeId, int maxZoomValue, Collection<WindowFunction> windowFunctions, String tmpDirName, int maxRecords) throws IOException, PreprocessingException { if (!ifile.endsWith(".affective.csv")) validateIsTilable(typeString); System.out.println("toTDF. File = " + ifile); System.out.println("Max zoom = " + maxZoomValue); if (probeFile != null && probeFile.trim().length() > 0) { System.out.println("Probe file = " + probeFile); } System.out.print("Window functions: "); for (WindowFunction wf : windowFunctions) { System.out.print(wf.toString() + " "); } System.out.println(); boolean isGCT = isGCT(typeString); Genome genome = loadGenome(genomeId, isGCT); if (genome == null) { throw new PreprocessingException("Genome could not be loaded: " + genomeId); } File inputFileOrDir = new File(ifile); // Estimae the total number of lines to be parsed, for progress updates int nLines = estimateLineCount(inputFileOrDir); // TODO -- move this block of code out of here, this should be done before calling this method // Convert gct files to igv format first File deleteme = null; if (isGCT(typeString)) { File tmpDir = null; if (tmpDirName != null && tmpDirName.length() > 0) { tmpDir = new File(tmpDirName); if (!tmpDir.exists() || !tmpDir.isDirectory()) { throw new PreprocessingException( "Specified tmp directory does not exist or is not directory: " + tmpDirName); } } else { tmpDir = new File(System.getProperty("java.io.tmpdir"), System.getProperty("user.name")); } if (!tmpDir.exists()) { tmpDir.mkdir(); } String baseName = (new File(ifile)).getName(); File igvFile = new File(tmpDir, baseName + ".igv"); igvFile.deleteOnExit(); doGCTtoIGV(typeString, ifile, igvFile, probeFile, maxRecords, tmpDirName, genome); inputFileOrDir = igvFile; deleteme = igvFile; typeString = ".igv"; } // Convert to tdf File outputFile = new File(ofile); try { Preprocessor p = new Preprocessor(outputFile, genome, windowFunctions, nLines, null); if (inputFileOrDir.isDirectory() || inputFileOrDir.getName().endsWith(".list")) { List<File> files = getFilesFromDirOrList(inputFileOrDir); for (File f : files) { p.preprocess(f, maxZoomValue, typeString); } } else { p.preprocess(inputFileOrDir, maxZoomValue, typeString); } p.finish(); } catch (IOException e) { e.printStackTrace(); // Delete output file as its probably corrupt if (outputFile.exists()) { outputFile.delete(); } } finally { if (deleteme != null && deleteme.exists()) { deleteme.delete(); } } System.out.flush(); }
package antenna.preprocessor.v3;