Example #1
0
  /**
   * Builds the {@code Document} used by coref (see {@code coref(Document doc, StringBuilder[]
   * outputs)}). Mention detection and document preprocessing are both performed here.
   *
   * <p>Steps, in order: optionally add missing annotations, optionally flag the annotation as using
   * marked discourse, adjust the remove-nested setting for Chinese CoNLL input, run mention
   * detection, find heads for gold mentions when they are present, and preprocess the document.
   *
   * @param input the input document; may be {@code null}
   * @return the preprocessed document, or {@code null} when {@code input} is {@code null}
   * @throws Exception declared by this method; presumably propagated from the steps it invokes
   */
  public Document makeDocument(InputDoc input) throws Exception {
    if (input == null) return null;
    Annotation anno = input.annotation;

    // add missing annotation
    if (needMissingAnnotations) {
      addMissingAnnotation(anno);
    }

    if (Boolean.parseBoolean(props.getProperty("hcoref.useMarkedDiscourse", "false"))) {
      anno.set(CoreAnnotations.UseMarkedDiscourseAnnotation.class, true);
    }

    // For Chinese CoNLL input, remove nested NPs with the same headword unless the document ID
    // contains "nw" (newswire). Locale values are compared with equals() rather than identity so
    // that an equal Locale that is not the Locale.CHINESE constant itself is still recognised.
    if (input.conllDoc != null && Locale.CHINESE.equals(CorefProperties.getLanguage(props))) {
      CorefProperties.setRemoveNested(props, !input.conllDoc.documentID.contains("nw"));
    }

    // mention detection: MD gives the following information about mentions: mention start/end
    // index, span, headword.
    // The remaining information is set in the preprocess step.
    List<List<Mention>> mentions = md.findMentions(anno, dict, props);
    Document doc = new Document(input, mentions);

    // find headword for gold mentions
    if (input.goldMentions != null) findGoldMentionHeads(doc);

    // document preprocessing: initialization (assign ID), mention processing (gender, number, type,
    // etc), speaker extraction, etc
    Preprocessor.preprocess(doc, dict, singletonPredictor, headFinder);

    return doc;
  }
Example #2
0
  /**
   * Command-line entry point. After option parsing, the first remaining argument selects the
   * command: {@code sort}, {@code pairsToBin}, {@code binToPairs}, {@code printmatrix},
   * {@code eigenvector} or {@code pre}. Any other command is silently ignored.
   *
   * @param argv raw command-line arguments (options plus positional arguments)
   * @throws IOException declared by this method; presumably raised by the invoked command
   * @throws CmdLineParser.UnknownOptionException declared for option parsing
   * @throws CmdLineParser.IllegalOptionValueException declared for option parsing
   */
  public static void main(String[] argv)
      throws IOException, CmdLineParser.UnknownOptionException,
          CmdLineParser.IllegalOptionValueException {

    // NOTE(review): this gate applies to every command and prints the "pre" usage, so commands
    // that need fewer than four raw arguments are rejected here too — confirm this is intended.
    if (argv.length < 4) {
      System.out.println("Usage: hictools pre <options> <inputFile> <outputFile> <genomeID>");
      System.out.println("  <options>: -d only calculate intra chromosome (diagonal) [false]");
      System.out.println(
          "           : -o calculate densities (observed/expected), write to file [false]");
      System.out.println("           : -t <int> only write cells with count above threshold t [0]");
      System.out.println(
          "           : -c <chromosome ID> only calculate map on specific chromosome");
      System.exit(0);
    }

    Globals.setHeadless(true);

    CommandLineParser parser = new CommandLineParser();
    parser.parse(argv);
    String[] args = parser.getRemainingArgs();

    if (args[0].equals("sort")) {
      AlignmentsSorter.sort(args[1], args[2], null);
    } else if (args[0].equals("pairsToBin")) {
      String ifile = args[1];
      String ofile = args[2];
      String genomeId = args[3];
      List<Chromosome> chromosomes = loadChromosomes(genomeId);
      AsciiToBinConverter.convert(ifile, ofile, chromosomes);
    } else if (args[0].equals("binToPairs")) {
      String ifile = args[1];
      String ofile = args[2];
      AsciiToBinConverter.convertBack(ifile, ofile);
    } else if (args[0].equals("printmatrix")) {
      // The command name plus five operands are read below (indices 0..5), so six entries are
      // required; the previous "< 5" check let args[5] run off the end of the array.
      if (args.length < 6) {
        System.err.println(
            "Usage: hictools printmatrix <observed/oe/pearson> hicFile chr1 chr2 binsize");
        System.exit(-1);
      }
      String type = args[1];
      String file = args[2];
      String chr1 = args[3];
      String chr2 = args[4];
      String binSizeSt = args[5];
      int binSize = 0;
      try {
        binSize = Integer.parseInt(binSizeSt);
      } catch (NumberFormatException e) {
        System.err.println("Integer expected.  Found: " + binSizeSt);
        System.exit(-1);
      }

      dumpMatrix(file, chr1, chr2, binSize, type);

    } else if (args[0].equals("eigenvector")) {
      if (args.length < 4) {
        System.err.println("Usage: hictools eigenvector hicFile chr binsize");
        // Stop here; otherwise the index accesses below fail with an array-bounds exception.
        System.exit(-1);
      }
      String file = args[1];
      String chr = args[2];
      String binSizeSt = args[3];
      int binSize = 0;
      try {
        binSize = Integer.parseInt(binSizeSt);
      } catch (NumberFormatException e) {
        System.err.println("Integer expected.  Found: " + binSizeSt);
        System.exit(-1);
      }
      calculateEigenvector(file, chr, binSize);
    } else if (args[0].equals("pre")) {
      // Explicit length check instead of catching ArrayIndexOutOfBoundsException; the message and
      // exit code are unchanged.
      if (args.length < 4) {
        System.err.println("No genome ID given");
        System.exit(0);
      }
      String genomeId = args[3];
      List<Chromosome> chromosomes = loadChromosomes(genomeId);

      // Sum the sizes of all non-null chromosomes, then store a synthetic "All" chromosome at
      // index 0 whose size is that total divided by 1000.
      long genomeLength = 0;
      for (Chromosome c : chromosomes) {
        if (c != null) genomeLength += c.getSize();
      }
      chromosomes.set(0, new Chromosome(0, "All", (int) (genomeLength / 1000)));

      // The input argument is a comma-separated list of files.
      String[] tokens = args[1].split(",");
      List<String> files = new ArrayList<String>(tokens.length);

      for (String f : tokens) {
        files.add(f);
      }

      Preprocessor preprocessor = new Preprocessor(new File(args[2]), chromosomes);

      preprocessor.setIncludedChromosomes(parser.getChromosomeOption());
      preprocessor.setCountThreshold(parser.getCountThresholdOption());
      preprocessor.setNumberOfThreads(parser.getThreadedOption());
      preprocessor.setDiagonalsOnly(parser.getDiagonalsOption());
      preprocessor.setLoadDensities(parser.getDensitiesOption());
      preprocessor.preprocess(files);
    }
  }
Example #3
0
  /**
   * Converts an input file, directory or {@code .list} file to tdf, writing the result to
   * {@code ofile}. GCT input is first converted to a temporary {@code .igv} file, which is deleted
   * again once the conversion has been attempted.
   *
   * @param typeString input type; validated with {@code validateIsTilable} unless {@code ifile}
   *     ends with {@code ".affective.csv"}
   * @param ifile path of the input file, directory, or {@code .list} file
   * @param ofile path of the output file
   * @param probeFile probe file passed to the GCT conversion; may be {@code null} or blank
   * @param genomeId identifier of the genome to load
   * @param maxZoomValue maximum zoom value passed to the preprocessor
   * @param windowFunctions window functions passed to the preprocessor
   * @param tmpDirName existing directory for the intermediate {@code .igv} file; when
   *     {@code null} or empty, a directory named after the {@code user.name} system property
   *     under {@code java.io.tmpdir} is used
   * @param maxRecords passed through to the GCT conversion
   * @throws IOException declared by this method. Note that an {@code IOException} raised during
   *     the tdf conversion step itself is caught, its stack trace printed, and the output file
   *     deleted; it is not rethrown.
   * @throws PreprocessingException if the genome cannot be loaded, the specified tmp directory
   *     does not exist or is not a directory, or the default tmp directory cannot be created
   */
  public void toTDF(
      String typeString,
      String ifile,
      String ofile,
      String probeFile,
      String genomeId,
      int maxZoomValue,
      Collection<WindowFunction> windowFunctions,
      String tmpDirName,
      int maxRecords)
      throws IOException, PreprocessingException {

    if (!ifile.endsWith(".affective.csv")) validateIsTilable(typeString);

    System.out.println("toTDF.  File = " + ifile);
    System.out.println("Max zoom = " + maxZoomValue);
    if (probeFile != null && probeFile.trim().length() > 0) {
      System.out.println("Probe file = " + probeFile);
    }
    System.out.print("Window functions: ");
    for (WindowFunction wf : windowFunctions) {
      System.out.print(wf.toString() + " ");
    }
    System.out.println();

    // Evaluated once, before typeString is rewritten to ".igv" further down.
    boolean gctInput = isGCT(typeString);
    Genome genome = loadGenome(genomeId, gctInput);
    if (genome == null) {
      throw new PreprocessingException("Genome could not be loaded: " + genomeId);
    }
    File inputFileOrDir = new File(ifile);

    // Estimate the total number of lines to be parsed, for progress updates
    int nLines = estimateLineCount(inputFileOrDir);

    // TODO -- move this block of code out of here, this should be done before calling this method
    // Convert gct files to igv format first
    File deleteme = null;
    if (gctInput) {
      File tmpDir = null;
      if (tmpDirName != null && tmpDirName.length() > 0) {
        tmpDir = new File(tmpDirName);
        if (!tmpDir.exists() || !tmpDir.isDirectory()) {
          throw new PreprocessingException(
              "Specified tmp directory does not exist or is not directory: " + tmpDirName);
        }
      } else {
        tmpDir = new File(System.getProperty("java.io.tmpdir"), System.getProperty("user.name"));
      }
      // Create the directory (and any missing parents) and fail with a clear message if that is
      // not possible. The trailing isDirectory() check tolerates the directory having been
      // created concurrently between exists() and mkdirs().
      if (!tmpDir.exists() && !tmpDir.mkdirs() && !tmpDir.isDirectory()) {
        throw new PreprocessingException(
            "Could not create tmp directory: " + tmpDir.getAbsolutePath());
      }

      String baseName = (new File(ifile)).getName();
      File igvFile = new File(tmpDir, baseName + ".igv");
      igvFile.deleteOnExit();
      doGCTtoIGV(typeString, ifile, igvFile, probeFile, maxRecords, tmpDirName, genome);

      // From here on the intermediate igv file is the input.
      inputFileOrDir = igvFile;
      deleteme = igvFile;
      typeString = ".igv";
    }

    // Convert to tdf
    File outputFile = new File(ofile);
    try {
      Preprocessor p = new Preprocessor(outputFile, genome, windowFunctions, nLines, null);
      if (inputFileOrDir.isDirectory() || inputFileOrDir.getName().endsWith(".list")) {
        List<File> files = getFilesFromDirOrList(inputFileOrDir);
        for (File f : files) {
          p.preprocess(f, maxZoomValue, typeString);
        }
      } else {
        p.preprocess(inputFileOrDir, maxZoomValue, typeString);
      }
      p.finish();
    } catch (IOException e) {
      // NOTE(review): the exception is reported but not rethrown, so callers are not told the
      // conversion failed — kept as-is to preserve existing behavior.
      e.printStackTrace();
      // Delete output file as it is probably corrupt
      if (outputFile.exists()) {
        outputFile.delete();
      }
    } finally {
      // Remove the intermediate igv file, if one was created.
      if (deleteme != null && deleteme.exists()) {
        deleteme.delete();
      }
    }

    System.out.flush();
  }
package antenna.preprocessor.v3;