Ejemplo n.º 1
0
  /**
   * Reads {@code fileName} using {@code encoding} and turns each well-formed line into a test
   * instance.
   *
   * <p>A line is used only if it contains at least two commas: the text before the first comma is
   * appended to {@code docIds}, and everything after the second comma is added to the returned
   * list through a pipe that tokenizes on runs of two or more letters. Lines without two commas
   * are skipped.
   *
   * @return the instances built from the matching lines, in file order
   * @throws IOException if the file cannot be opened
   */
  private InstanceList readFile() throws IOException {

    ArrayList<Pipe> pipeList = new ArrayList<Pipe>();
    pipeList.add(new CharSequence2TokenSequence(Pattern.compile("\\p{L}\\p{L}+")));
    pipeList.add(new TokenSequence2FeatureSequence());

    InstanceList testing = new InstanceList(new SerialPipes(pipeList));

    // Compiled once instead of once per line: <first field>,<second field>,<rest of line>.
    Pattern linePattern = Pattern.compile("^(.*?),(.*?),(.*)$");

    // Opened only after the pipes are built so that nothing can fail between opening the file
    // and entering the try/finally that closes it.
    Scanner scanner = new Scanner(new FileInputStream(fileName), encoding);
    try {
      while (scanner.hasNextLine()) {
        // Drop stray carriage returns left over from CRLF line endings.
        String text = scanner.nextLine().replace("\r", "");

        Matcher matcher = linePattern.matcher(text);
        if (matcher.find()) {
          docIds.add(matcher.group(1));
          testing.addThruPipe(new Instance(matcher.group(3), null, "test instance", null));
        }
      }
    } finally {
      scanner.close();
    }

    return testing;
  }
 /**
  * Adds one instance per token sequence generated from the given CAS to {@code instanceList}.
  *
  * <p>Every instance carries {@code NONE_LABEL} as its second constructor argument and the
  * document id and URI from the CAS metadata as its third and fourth.
  *
  * @param aJCas the CAS to process
  * @throws AnalysisEngineProcessException wrapping any {@link FeaturePathException} raised while
  *     generating or adding the token sequences
  */
 @Override
 public void process(JCas aJCas) throws AnalysisEngineProcessException {
   final DocumentMetaData docMeta = DocumentMetaData.get(aJCas);
   try {
     for (TokenSequence sequence : generateTokenSequences(aJCas)) {
       Instance instance =
           new Instance(sequence, NONE_LABEL, docMeta.getDocumentId(), docMeta.getDocumentUri());
       instanceList.addThruPipe(instance);
     }
   } catch (FeaturePathException cause) {
     throw new AnalysisEngineProcessException(cause);
   }
 }
Ejemplo n.º 3
0
  /**
   * Registers a document: pipes it into {@code list} and remembers its body in {@code documents}.
   *
   * @param key passed as the second argument of the new {@code Instance}
   * @param body the document text; used as the instance's first argument and stored as-is
   * @param title passed as the third argument of the new {@code Instance}
   */
  public void addDocument(String key, String body, String title) {
    final Instance instance = new Instance(body, key, title, null);
    list.addThruPipe(instance);

    // Only the raw body is kept alongside the piped instance.
    this.documents.add(body);
  }
Ejemplo n.º 4
0
  /**
   * Runs the given documents through the sentence-splitting and tokenizing pipeline.
   *
   * <p>The documents are first wrapped, unchanged, in a list backed by a {@code Noop} pipe and
   * are then fed through a {@code SerialPipes} built from the {@code pipes} field, with a
   * {@code DocumentToSentencesPipe} followed by a {@code Sentence2TokenSequence} placed in front.
   *
   * <p>The previous version carried a document counter and a "stop after N documents" branch,
   * but the counter was never incremented, so the branch could not be reached; both were removed
   * without changing behavior.
   *
   * <p>NOTE(review): each call prepends two more pipes to the {@code pipes} field, so calling
   * this method repeatedly on the same object keeps growing the pipeline - confirm this is
   * intended.
   *
   * @param docsToRun the documents to process
   * @return the instances produced by the full pipeline
   */
  public InstanceList processDocs(Collection<ConnectionsDocument> docsToRun) {
    InstanceList rawDocuments = new InstanceList(new Noop());
    for (ConnectionsDocument doc : docsToRun) {
      rawDocuments.addThruPipe(new Instance(doc, null, null, null));
    }

    // addFirst is applied in reverse so the final order is: document -> sentences -> tokens.
    pipes.addFirst(new Sentence2TokenSequence(skipAbbrev, inputTokenSet));
    pipes.addFirst(new DocumentToSentencesPipe(inputTokenSet));

    InstanceList processed = new InstanceList(new SerialPipes(pipes));
    processed.addThruPipe(rawDocuments.iterator());
    return processed;
  }
 /**
  * Pipes every string in {@code cleanTexts} into a new instance list, then labels the resulting
  * instances.
  *
  * <p>After piping, the instance at position {@code i} gets the name {@code name_id.get(i)} and
  * the target {@code "english"}. The previous version ran this labeling loop before anything had
  * been added to the list, so the loop body never executed and no name or target was ever set.
  *
  * <p>NOTE(review): the target is assigned after the pipe has run, so it is not transformed by
  * the pipe; {@code name_id} is assumed to hold an entry for every index in
  * {@code cleanTexts} - confirm against the caller.
  *
  * @param cleanTexts the texts to convert, one instance per element
  * @return the populated instance list
  */
 public InstanceList readArray(String[] cleanTexts) {
   StringArrayIterator iterator = new StringArrayIterator(cleanTexts);
   // Construct a new instance list, passing it the pipe we want to use to
   // process instances.
   InstanceList instances = new InstanceList(pipe);
   // Process each instance provided by the iterator first; the list is empty until this runs.
   instances.addThruPipe(iterator);

   int index = 0;
   for (Instance inst : instances) {
     inst.setName(name_id.get(index));
     inst.setTarget("english");
     index++;
   }
   return instances;
 }
  /**
   * Trains a maximum-entropy classifier from a training feature table and serializes it.
   *
   * <p>Lines of the training file should be formatted as:
   *
   * <pre>
   *   [name] [label] [data ... ]
   * </pre>
   *
   * where the features in {@code [data ...]} should look like {@code feature:value}.
   *
   * <p>While training runs, {@code System.err} is redirected to {@code System.out}; the original
   * stream is restored afterwards even if training fails. The training reader and the model
   * output stream are likewise closed on every exit path.
   *
   * @param trainingFilename path of the training feature table
   * @param modelFile file the trained classifier is written to with Java serialization
   * @return the trained classifier
   * @throws IOException if the training file cannot be read or the model cannot be written
   */
  public static Classifier TrainMaxent(String trainingFilename, File modelFile) throws IOException {
    // build data input pipe
    ArrayList<Pipe> pipes = new ArrayList<Pipe>();
    pipes.add(new Target2Label());
    pipes.add(new Csv2FeatureVector());

    Pipe pipe = new SerialPipes(pipes);
    pipe.setTargetProcessing(true);

    // read data
    InstanceList trainingInstances = new InstanceList(pipe);
    FileReader training_file_reader = new FileReader(trainingFilename);
    try {
      CsvIterator reader =
          new CsvIterator(
              training_file_reader,
              "(\\w+)\\s+([^\\s]+)\\s+(.*)",
              3,
              2,
              1); // (data, label, name) field indices
      trainingInstances.addThruPipe(reader);
    } finally {
      training_file_reader.close();
    }

    // calculate running time
    long startTime = System.currentTimeMillis();
    PrintStream originalErr = System.err;
    System.setErr(System.out);

    Classifier classifier;
    try {
      // train a Maxent classifier (could be other classifiers)
      ClassifierTrainer trainer = new MaxEntTrainer(Gaussian_Variance);
      classifier = trainer.train(trainingInstances);
    } finally {
      // Never leave the process-wide error stream redirected.
      System.setErr(originalErr);
    }

    long totalTime = System.currentTimeMillis() - startTime;
    System.out.println("Total training time: " + totalTime);

    // write model
    FileOutputStream modelStream = new FileOutputStream(modelFile);
    try {
      ObjectOutputStream oos = new ObjectOutputStream(modelStream);
      oos.writeObject(classifier);
      oos.close();
    } finally {
      // Releases the file handle if wrapping or writing failed; a no-op after a normal close.
      modelStream.close();
    }

    return classifier;
  }
Ejemplo n.º 7
0
  /**
   * Loads {@code fileName} (read as UTF-8) into an instance list.
   *
   * <p>Each line is split by the pattern {@code ^(\S*)[\s,]*(\S*)[\s,]*(.*)$} into name (group
   * 1), label (group 2) and data (group 3); the data is tokenized on runs of two or more letters
   * and converted to a feature sequence. The file reader is closed before returning, including
   * when loading fails.
   *
   * @return the instances read from the file
   * @throws Exception if the file cannot be opened or read
   */
  private InstanceList generateInstanceList() throws Exception {

    ArrayList<Pipe> pipeList = new ArrayList<Pipe>();
    pipeList.add(new CharSequence2TokenSequence(Pattern.compile("\\p{L}\\p{L}+")));
    pipeList.add(new TokenSequence2FeatureSequence());

    InstanceList instances = new InstanceList(new SerialPipes(pipeList));

    Reader fileReader = new InputStreamReader(new FileInputStream(new File(fileName)), "UTF-8");
    try {
      instances.addThruPipe(
          new CsvIterator(
              fileReader,
              Pattern.compile("^(\\S*)[\\s,]*(\\S*)[\\s,]*(.*)$"),
              3,
              2,
              1)); // data, label, name fields
    } finally {
      // addThruPipe has consumed the iterator by now, so the reader is no longer needed.
      fileReader.close();
    }

    return instances;
  }
Ejemplo n.º 8
0
  /**
   * Prepares instances for use with LDA.
   *
   * <p>Input lines are matched against {@code (.*)\t(.*)\t(.*)}: group 1 becomes the instance
   * name, group 2 the label and group 3 the data. The pipeline then converts the target to a
   * label, lowercases the data, tokenizes it, removes stopwords (configured through
   * {@code stopWords} and {@code stopWordsEncoding}) and maps the tokens to a feature sequence.
   * This method does not close the reader.
   *
   * @param r reader supplying the tab-separated lines
   * @return the instance list filled from {@code r}
   */
  public static InstanceList loadInstancesLDA(Reader r) {
    final ArrayList<Pipe> stages = new ArrayList<Pipe>();

    stages.add(new Target2Label());
    stages.add(new CharSequenceLowercase());

    // A token starts and ends with a letter and may contain letters or punctuation in between.
    final Pattern tokenPattern = Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}");
    stages.add(new CharSequence2TokenSequence(tokenPattern));

    stages.add(new TokenSequenceRemoveStopwords(stopWords, stopWordsEncoding, false, false, false));
    stages.add(new TokenSequence2FeatureSequence());

    final InstanceList result = new InstanceList(new SerialPipes(stages));

    // Field order for CsvIterator: data = group 3, label = group 2, name = group 1.
    final Pattern linePattern = Pattern.compile("(.*)\t(.*)\t(.*)");
    final CsvIterator lines = new CsvIterator(r, linePattern, 3, 2, 1);
    result.addThruPipe(lines);

    return result;
  }
Ejemplo n.º 9
0
  /**
   * Loads the topic model from {@code inferencerFile}, reads {@code fileName} (UTF-8) into an
   * instance list and prints the sampled topic distribution of one instance.
   *
   * <p>One line is printed per entry of the returned distribution. The previous version always
   * looped to index 999 and therefore threw {@code ArrayIndexOutOfBoundsException} whenever the
   * distribution had fewer than 1000 entries; it also never closed the file reader.
   *
   * <p>NOTE(review): the distribution is sampled for {@code instances.get(1)}, i.e. the second
   * instance in the file - confirm that this is intended rather than index 0.
   *
   * @throws Exception if the model or the data file cannot be read
   */
  public void test() throws Exception {

    ParallelTopicModel model = ParallelTopicModel.read(new File(inferencerFile));
    TopicInferencer inferencer = model.getInferencer();

    ArrayList<Pipe> pipeList = new ArrayList<Pipe>();
    pipeList.add(new CharSequence2TokenSequence(Pattern.compile("\\p{L}\\p{L}+")));
    pipeList.add(new TokenSequence2FeatureSequence());

    InstanceList instances = new InstanceList(new SerialPipes(pipeList));
    Reader fileReader = new InputStreamReader(new FileInputStream(new File(fileName)), "UTF-8");
    try {
      instances.addThruPipe(
          new CsvIterator(
              fileReader,
              Pattern.compile("^(\\S*)[\\s,]*(\\S*)[\\s,]*(.*)$"),
              3,
              2,
              1)); // data, label, name fields
    } finally {
      fileReader.close();
    }

    double[] testProbabilities = inferencer.getSampledDistribution(instances.get(1), 10, 1, 5);
    // Bound by the actual array length rather than a hard-coded topic count.
    for (int i = 0; i < testProbabilities.length; i++) {
      System.out.println(i + ": " + testProbabilities[i]);
    }
  }
Ejemplo n.º 10
0
  /**
   * Runs a gzip-compressed training file through a CRF feature pipeline and prints the resulting
   * sequences to the file {@code test.out}.
   *
   * <p>The input is split into instances at blank lines (pattern {@code ^\s*$}). Both the output
   * writer and the input file are closed on every exit path; the previous version left the writer
   * open when piping failed and never closed the input at all.
   *
   * @param trainingFilename path of the gzip-compressed training data
   * @throws IOException if the input cannot be opened or read, or the output cannot be created
   */
  public TestCRFPipe(String trainingFilename) throws IOException {

    int[][] conjunctions = new int[3][];
    conjunctions[0] = new int[] {-1};
    conjunctions[1] = new int[] {1};
    conjunctions[2] = new int[] {-2, -1};

    PrintWriter out = new PrintWriter("test.out");
    try {
      ArrayList<Pipe> pipes = new ArrayList<Pipe>();

      pipes.add(new SimpleTaggerSentence2TokenSequence());
      // pipes.add(new FeaturesInWindow("PREV-", -1, 1));
      // pipes.add(new FeaturesInWindow("NEXT-", 1, 2));
      pipes.add(new OffsetConjunctions(conjunctions));
      pipes.add(new TokenTextCharSuffix("C1=", 1));
      pipes.add(new TokenTextCharSuffix("C2=", 2));
      pipes.add(new TokenTextCharSuffix("C3=", 3));
      pipes.add(new RegexMatches("CAPITALIZED", Pattern.compile("^\\p{Lu}.*")));
      pipes.add(new RegexMatches("STARTSNUMBER", Pattern.compile("^[0-9].*")));
      pipes.add(new RegexMatches("HYPHENATED", Pattern.compile(".*\\-.*")));
      pipes.add(new RegexMatches("DOLLARSIGN", Pattern.compile("\\$.*")));
      pipes.add(new TokenFirstPosition("FIRSTTOKEN"));
      pipes.add(new TokenSequence2FeatureVectorSequence());
      pipes.add(new SequencePrintingPipe(out));

      Pipe pipe = new SerialPipes(pipes);

      InstanceList trainingInstances = new InstanceList(pipe);

      // Open the raw file separately so it is released even if the gzip header is invalid.
      FileInputStream rawInput = new FileInputStream(trainingFilename);
      try {
        BufferedReader trainingReader =
            new BufferedReader(new InputStreamReader(new GZIPInputStream(rawInput)));
        try {
          trainingInstances.addThruPipe(
              new LineGroupIterator(trainingReader, Pattern.compile("^\\s*$"), true));
        } finally {
          trainingReader.close();
        }
      } finally {
        // A no-op when the reader above has already closed the underlying stream.
        rawInput.close();
      }
    } finally {
      out.close();
    }
  }
  /**
   * Builds an {@code Arg1RankInstance} for one relation and adds it through the pipe of
   * {@code instanceList}.
   *
   * <p>Nothing is added when the connective has no head leaves in the arg2 tree, when either
   * argument's head position is -1, or when no generated candidate matches the arg1 head
   * (line {@code arg1Line}, position of the arg1 syntactic head). In the latter cases a
   * diagnostic line is printed to standard output.
   *
   * @param relation the relation supplying the connective and argument Gorn addresses
   * @param document the document supplying trees, sentences and dependency graphs by line
   * @param arg1Line line index of the first argument
   * @param arg2Line line index of the second argument (also where the connective is looked up)
   * @param instanceList the list the new instance is added to
   */
  private void addInstancesThroughPipe(
      PDTBRelation relation,
      Document document,
      int arg1Line,
      int arg2Line,
      InstanceList instanceList) {
    // System.out.println("Relation: " + relation.toString());
    // System.out.println("arg1Line: " + arg1Line);
    // System.out.println("arg2Line: " + arg2Line);

    // Locate the connective's head leaves inside the arg2 tree; without them there is no span.
    String connectiveGornAddress = relation.getConnectiveGornAddress();
    Tree arg2Tree = document.getTree(arg2Line);
    List<Tree> connHeadLeaves =
        connAnalyzer.getConnHeadLeaves(arg2Tree, connectiveGornAddress, relation.getConnHead());
    if (connHeadLeaves.isEmpty()) return;

    // Connective span = leaf positions of the first and last head leaf.
    int connStart = treeAnalyzer.getLeafPosition(arg2Tree, connHeadLeaves.get(0));
    int connEnd =
        treeAnalyzer.getLeafPosition(arg2Tree, connHeadLeaves.get(connHeadLeaves.size() - 1));
    // A span wider than 4 is collapsed onto its start position.
    if ((connEnd - connStart) > 4) { // handle if..else, etc.
      connEnd = connStart;
    }

    // consider only the first sentence in case of multi-line argument1
    String arg1GornAddress = relation.getArg1GornAddress();
    Tree arg1Tree = document.getTree(arg1Line);
    List<Tree> arg1GornNodes = getArgGornNodes(arg1Tree, arg1Line, arg1GornAddress);

    Tree syntacticHead = headAnalyzer.getSyntacticHead(arg1Tree, arg1GornNodes);
    int arg1HeadPos = treeAnalyzer.getLeafPosition(arg1Tree, syntacticHead);

    // Same head lookup for argument 2.
    String arg2GornAddress = relation.getArg2GornAddress();
    List<Tree> arg2GornNodes = getArgGornNodes(arg2Tree, arg2Line, arg2GornAddress);

    Tree arg2SyntacticHead = headAnalyzer.getSyntacticHead(arg2Tree, arg2GornNodes);
    int arg2HeadPos = treeAnalyzer.getLeafPosition(arg2Tree, arg2SyntacticHead);

    // A head position of -1 is treated as "not found"; report it and skip the relation.
    if (arg2HeadPos == -1) {
      System.out.println("arg2Head == -1");
      return;
    }
    if (arg1HeadPos == -1) {
      System.out.println("arg1Head == -1");
      return;
    }
    // Find the index of the candidate that equals (arg1Line, arg1HeadPos); -1 if none does.
    int trueCandidate = -1;
    List<Pair<Integer, Integer>> candidates =
        getCandidates(document, arg2Line, connStart, connEnd, arg1Line);
    for (int i = 0; i < candidates.size(); i++) {
      Pair<Integer, Integer> candidate = candidates.get(i);
      if (candidate.first() == arg1Line && candidate.second() == arg1HeadPos) {
        trueCandidate = i;
        break;
      }
    }
    if (trueCandidate == -1) {
      // trueCandidate = candidates.size();
      // candidates.add(new Pair<Integer, Integer>(arg1Line, arg1HeadPos));
      // System.out.println("Covered!");
      System.out.println("true candidate == -1!!!");
      System.out.println(syntacticHead.value());
    } else {
      // NOTE(review): extractArg2 is computed (and a warning printed when it is -1) but it is
      // not passed to the instance below, which uses arg2HeadPos instead - see the commented-out
      // constructor call.
      int extractArg2 =
          ARG2_EXTRACTOR.extractArg2(
              document.getSentence(arg2Line),
              document.getTree(arg2Line),
              document.getDepGraph(arg2Line),
              connStart,
              connEnd);
      if (extractArg2 == -1) {
        extractArg2 = 0;
        System.out.println("Arg2 == -1!!!!!!!!!!!!!!!!!");
      }
      // Arg1RankInstance instance = new Arg1RankInstance(document, candidates, arg2Line,
      // extractArg2, connStart, connEnd, trueCandidate);

      Arg1RankInstance instance =
          new Arg1RankInstance(
              document, candidates, arg2Line, arg2HeadPos, connStart, connEnd, trueCandidate);
      instanceList.addThruPipe(instance);
    }
  }
Ejemplo n.º 12
0
  /**
   * Reads a semicolon-separated concordance file and converts every valid line into an instance.
   *
   * <p>A valid line has exactly four fields: document id, line id, sense id and text, whose
   * {@code "Doc ID: "}, {@code "Line ID: "} and {@code "DOE sense ID: "} prefixes are stripped
   * from the first three. For each line the context tokens of {@code targetTerm} are fetched from
   * {@code corpus}, joined with spaces and piped in as the instance data, with the sense id as
   * target, {@code docID_lineID} as name and the raw text as source. Invalid lines are reported
   * on standard output and skipped. I/O errors are printed and whatever was read so far is
   * returned.
   *
   * @param targetTerm the term whose context is collected; {@code "faeder"} is looked up as
   *     {@code "fæder"}
   * @param sourceFile path of the concordance file
   * @param termWindowSize number of context tokens expected on each side of the target term
   * @param pipe the pipe every instance is run through
   * @param useCollocationalVector if true, each context token is suffixed with {@code _offset},
   *     its position relative to the target term
   * @return the instances built from the file
   */
  private static InstanceList readConcordanceFileToInstanceList(
      String targetTerm,
      String sourceFile,
      int termWindowSize,
      Pipe pipe,
      boolean useCollocationalVector) {
    InstanceList result = new InstanceList(pipe);
    BufferedReader reader = null;
    try {
      reader = new BufferedReader(new FileReader(sourceFile));
      int incompleteCount = 0;

      for (String line = reader.readLine(); line != null; line = reader.readLine()) {
        String[] fields = line.split(";");

        if (fields.length != 4) {
          System.out.println(
              "WARNING: Skipping possibly invalid CSV line " + line + " in file " + sourceFile);
          continue;
        }

        String docID = fields[0].replace("Doc ID: ", "").trim();
        String lineID = fields[1].replace("Line ID: ", "").trim();
        String senseID = fields[2].replace("DOE sense ID: ", "").trim();
        String text = fields[3];
        String instanceID = (docID + "_" + lineID).replaceAll(" ", "_");

        // The corpus is queried with the ligature spelling of this term.
        if (targetTerm.equals("faeder")) {
          targetTerm = "fæder";
        }

        ArrayList<String> tokens =
            corpus.getWindowTokens(targetTerm, docID, lineID, termWindowSize);

        if (tokens.size() != 2 * termWindowSize) {
          incompleteCount++;
          System.out.println(
              "WARNING: Incomplete token list " + incompleteCount + " found " + tokens);
        }

        if (useCollocationalVector) {
          System.out.println("Converting data to collocational vector: \n\t" + tokens);
          // Walk offsets -window..+window; offset 0 is the target term itself and owns no token.
          int slot = 0;
          for (int offset = -termWindowSize;
              offset <= termWindowSize && slot < tokens.size();
              offset++) {
            if (offset == 0) {
              continue;
            }
            tokens.set(slot, tokens.get(slot) + "_" + offset);
            slot++;
          }
          System.out.println("Converting data to collocational vector...DONE\n\t" + tokens);
        }

        String dataStr =
            tokens.toString().replace(", ", " ").replace("[", "").replace("]", "").replace(".", "");

        result.addThruPipe(new Instance(dataStr, senseID, instanceID, text));
      }
      reader.close();
    } catch (IOException e) {
      // Also covers FileNotFoundException, which was handled identically.
      e.printStackTrace();
    } finally {
      if (reader != null) {
        try {
          reader.close();
        } catch (IOException e1) {
        }
      }
    }

    return result;
  }
Ejemplo n.º 13
0
  /**
   * Assembles the feature pipeline described by {@code featureConfig} and runs the sentences
   * through it.
   *
   * <p>The pipeline always contains the base pipe, the default surface-pattern features and the
   * final conversion to feature vector sequences. Bio surface patterns, prefix/suffix features,
   * lexicon membership, offset conjunctions, token n-grams and character n-grams are added only
   * when the configuration enables or provides them. A lexicon file that cannot be found is
   * reported with a stack trace and otherwise ignored.
   *
   * @param sentences the sentences to convert
   * @param featureConfig the feature configuration properties
   * @return the instances produced by piping all sentences
   */
  public InstanceList createFeatureData(
      final ArrayList<Sentence> sentences, final Properties featureConfig) {

    final FeatureConfiguration config = new FeatureConfiguration();
    final ArrayList<Pipe> stages = new ArrayList<Pipe>();

    // base pipe
    stages.add(new BasePipe(featureConfig));

    // default surface patterns, as {feature name, regular expression} in pipeline order
    final String[][] surfacePatterns = {
      {"INITLOWCAPS_ANYTHING_NONUMBER", "[" + UNICODE_LOWER + "][" + UNICODE_UPPER + "][^0-9]*"},
      {"INITLOWCAPS_ANYTHING_WITHNUMBER", "[" + UNICODE_LOWER + "][" + UNICODE_UPPER + "].*[0-9].*"},
      {"INITCAPS", "[" + UNICODE_UPPER + "].*"},
      {"INITCAPSALPHA", "[" + UNICODE_UPPER + "][" + UNICODE_LOWER + "].*"},
      {"ALLCAPS", "[" + UNICODE_UPPER + "]+"},
      {"CAPSMIX", "[" + UNICODE_UPPER + UNICODE_LOWER + "]+"},
      {"HASDIGIT", ".*[0-9].*"},
      {"SINGLEDIGIT", "[0-9]"},
      {"DOUBLEDIGIT", "[0-9][0-9]"},
      {"NATURALNUMBER", "[0-9]+"},
      {"REALNUMBER", "[-0-9]+[.,]+[0-9.,]+"},
      {"HASDASH", ".*-.*"},
      {"INITDASH", "-.*"},
      {"ENDDASH", ".*-"},
      // ALPHANUMERIC is registered twice on purpose: letter-then-digit and digit-then-letter.
      {"ALPHANUMERIC", ".*[" + UNICODE_UPPER + UNICODE_LOWER + "].*[0-9].*"},
      {"ALPHANUMERIC", ".*[0-9].*[" + UNICODE_UPPER + UNICODE_LOWER + "].*"},
      {"IS_PUNCTUATION_MARK", "[,.;:?!]"},
      {"IS_MINUSDASHSLASH", "[-_/]"},
    };
    for (final String[] entry : surfacePatterns) {
      stages.add(new RegexMatches(entry[0], Pattern.compile(entry[1])));
    }

    // bio surface patterns
    if (config.featureActive(featureConfig, "feat_bioregexp_enabled")) {
      final String[][] bioPatterns = {
        {"ROMAN", "[IVXDLCM]+"},
        {"HASROMAN", ".*\\b[IVXDLCM]+\\b.*"},
        {"GREEK", GREEK},
        {"HASGREEK", ".*\\b" + GREEK + "\\b.*"},
      };
      for (final String[] entry : bioPatterns) {
        stages.add(new RegexMatches(entry[0], Pattern.compile(entry[1])));
      }
    }

    // prefix and suffix
    final int[] prefixLengths = config.getIntArray(featureConfig, "prefix_sizes");
    if (prefixLengths != null) {
      for (int i = 0; i < prefixLengths.length; i++) {
        stages.add(new TokenTextCharPrefix("PREFIX=", prefixLengths[i]));
      }
    }
    final int[] suffixLengths = config.getIntArray(featureConfig, "suffix_sizes");
    if (suffixLengths != null) {
      for (int i = 0; i < suffixLengths.length; i++) {
        stages.add(new TokenTextCharSuffix("SUFFIX=", suffixLengths[i]));
      }
    }

    // lexicon membership
    for (final String lexiconKey : config.getLexiconKeys(featureConfig)) {
      final File lexiconFile = new File(featureConfig.getProperty(lexiconKey));
      try {
        stages.add(new LexiconMembership(lexiconKey + "_membership", lexiconFile, true));
      } catch (final FileNotFoundException missing) {
        missing.printStackTrace();
      }
    }

    // offset conjunction
    final int[][] conjunctionOffsets =
        config.offsetConjFromConfig(featureConfig.getProperty("offset_conjunctions"));
    if (conjunctionOffsets != null) {
      stages.add(new OffsetConjunctions(conjunctionOffsets));
    }

    // token ngrams
    final int[] tokenNGramSizes = config.getIntArray(featureConfig, "token_ngrams");
    if (tokenNGramSizes != null) {
      stages.add(new TokenNGramPipe(tokenNGramSizes));
    }

    // character ngrams
    final int[] charNGramSizes = config.getIntArray(featureConfig, "char_ngrams");
    if (charNGramSizes != null) {
      stages.add(new TokenTextCharNGrams("CHAR_NGRAM=", charNGramSizes));
    }

    // un-comment this for printing out the generated features
    // stages.add(new PrintTokenSequenceFeatures());

    stages.add(new TokenSequence2FeatureVectorSequence(true, true));

    final Pipe pipeline = new SerialPipes(stages.toArray(new Pipe[stages.size()]));

    // now run data through pipes
    final InstanceList featureData = new InstanceList(pipeline);
    featureData.addThruPipe(new SentencePipeIterator(sentences));

    return featureData;
  }
Ejemplo n.º 14
0
  /**
   * Command-line wrapper to train, test, or run a generic CRF-based tagger.
   *
   * @param args the command line arguments. Options (shell and Java quoting should be added as
   *     needed):
   *     <dl>
   *       <dt><code>--help</code> <em>boolean</em>
   *       <dd>Print this command line option usage information. Give <code>true</code> for longer
   *           documentation. Default is <code>false</code>.
   *       <dt><code>--prefix-code</code> <em>Java-code</em>
   *       <dd>Java code you want run before any other interpreted code. Note that the text is
   *           interpreted without modification, so unlike some other Java code options, you need to
   *           include any necessary 'new's. Default is null.
   *       <dt><code>--gaussian-variance</code> <em>positive-number</em>
   *       <dd>The Gaussian prior variance used for training. Default is 10.0.
   *       <dt><code>--train</code> <em>boolean</em>
   *       <dd>Whether to train. Default is <code>false</code>.
   *       <dt><code>--iterations</code> <em>positive-integer</em>
   *       <dd>Number of training iterations. Default is 500.
   *       <dt><code>--test</code> <code>lab</code> or <code>seg=</code><em>start-1</em><code>.
   *           </code><em>continue-1</em><code>,</code>...<code>,</code><em>start-n</em><code>.
   *           </code><em>continue-n</em>
   *       <dd>Test measuring labeling or segmentation (<em>start-i</em>, <em>continue-i</em>)
   *           accuracy. Default is no testing.
   *       <dt><code>--training-proportion</code> <em>number-between-0-and-1</em>
   *       <dd>Fraction of data to use for training in a random split. Default is 0.5.
   *       <dt><code>--model-file</code> <em>filename</em>
   *       <dd>The filename for reading (train/run) or saving (train) the model. Default is null.
   *       <dt><code>--random-seed</code> <em>integer</em>
   *       <dd>The random seed for randomly selecting a proportion of the instance list for training
   *           Default is 0.
   *       <dt><code>--orders</code> <em>comma-separated-integers</em>
   *       <dd>List of label Markov orders (main and backoff) Default is 1.
   *       <dt><code>--forbidden</code> <em>regular-expression</em>
   *       <dd>If <em>label-1</em><code>,</code><em>label-2</em> matches the expression, the
   *           corresponding transition is forbidden. Default is <code>\\s</code> (nothing
   *           forbidden).
   *       <dt><code>--allowed</code> <em>regular-expression</em>
   *       <dd>If <em>label-1</em><code>,</code><em>label-2</em> does not match the expression, the
   *           corresponding expression is forbidden. Default is <code>.*</code> (everything
   *           allowed).
   *       <dt><code>--default-label</code> <em>string</em>
   *       <dd>Label for initial context and uninteresting tokens. Default is <code>O</code>.
   *       <dt><code>--viterbi-output</code> <em>boolean</em>
   *       <dd>Print Viterbi periodically during training. Default is <code>false</code>.
   *       <dt><code>--fully-connected</code> <em>boolean</em>
   *       <dd>Include all allowed transitions, even those not in training data. Default is <code>
   *           true</code>.
   *       <dt><code>--n-best</code> <em>positive-integer</em>
   *       <dd>Number of answers to output when applying model. Default is 1.
   *       <dt><code>--include-input</code> <em>boolean</em>
   *       <dd>Whether to include input features when printing decoding output. Default is <code>
   *           false</code>.
   *     </dl>
   *     Remaining arguments:
   *     <ul>
   *       <li><em>training-data-file</em> if training
   *       <li><em>training-and-test-data-file</em>, if training and testing with random split
   *       <li><em>training-data-file</em> <em>test-data-file</em> if training and testing from
   *           separate files
   *       <li><em>test-data-file</em> if testing
   *       <li><em>input-data-file</em> if applying to new data (unlabeled)
   *     </ul>
   *
   * @exception Exception if an error occurs
   */
  public static void main(String[] args) throws Exception {
    // NOTE(review): trainingFile and testFile are opened below but never closed in this method.
    Reader trainingFile = null, testFile = null;
    InstanceList trainingData = null, testData = null;
    // NOTE(review): numEvaluations and iterationsBetweenEvals are not referenced again in this
    // method.
    int numEvaluations = 0;
    int iterationsBetweenEvals = 16;
    // restArgs is used below as the index of the first data-file argument; if it equals
    // args.length no data file was given.
    int restArgs = commandOptions.processOptions(args);
    if (restArgs == args.length) {
      commandOptions.printUsage(true);
      throw new IllegalArgumentException("Missing data file(s)");
    }
    // When training, the first remaining argument is the training file and an optional second one
    // (only with --test) is the test file; otherwise the first remaining argument is the test or
    // input file.
    if (trainOption.value) {
      trainingFile = new FileReader(new File(args[restArgs]));
      if (testOption.value != null && restArgs < args.length - 1)
        testFile = new FileReader(new File(args[restArgs + 1]));
    } else testFile = new FileReader(new File(args[restArgs]));

    Pipe p = null;
    CRF crf = null;
    TransducerEvaluator eval = null;
    // An existing model is required both for continued training and for any non-training run;
    // its input pipe is reused. A fresh training run builds a new pipe instead.
    if (continueTrainingOption.value || !trainOption.value) {
      if (modelOption.value == null) {
        commandOptions.printUsage(true);
        throw new IllegalArgumentException("Missing model file option");
      }
      ObjectInputStream s = new ObjectInputStream(new FileInputStream(modelOption.value));
      crf = (CRF) s.readObject();
      s.close();
      p = crf.getInputPipe();
    } else {
      p = new SimpleTaggerSentence2FeatureVectorSequence();
      // Register the default label in the target alphabet before any data is read.
      p.getTargetAlphabet().lookupIndex(defaultOption.value);
    }

    // Load the data. Instances are separated by blank lines (pattern ^\s*$) in every case.
    if (trainOption.value) {
      p.setTargetProcessing(true);
      trainingData = new InstanceList(p);
      trainingData.addThruPipe(
          new LineGroupIterator(trainingFile, Pattern.compile("^\\s*$"), true));
      logger.info("Number of features in training data: " + p.getDataAlphabet().size());
      if (testOption.value != null) {
        if (testFile != null) {
          testData = new InstanceList(p);
          testData.addThruPipe(new LineGroupIterator(testFile, Pattern.compile("^\\s*$"), true));
        } else {
          // No separate test file: split the training data randomly into train and test parts.
          Random r = new Random(randomSeedOption.value);
          InstanceList[] trainingLists =
              trainingData.split(
                  r, new double[] {trainingFractionOption.value, 1 - trainingFractionOption.value});
          trainingData = trainingLists[0];
          testData = trainingLists[1];
        }
      }
    } else if (testOption.value != null) {
      // Testing only: the data is labeled, so targets are processed.
      p.setTargetProcessing(true);
      testData = new InstanceList(p);
      testData.addThruPipe(new LineGroupIterator(testFile, Pattern.compile("^\\s*$"), true));
    } else {
      // Applying the model to new data: target processing is switched off.
      p.setTargetProcessing(false);
      testData = new InstanceList(p);
      testData.addThruPipe(new LineGroupIterator(testFile, Pattern.compile("^\\s*$"), true));
    }
    logger.info("Number of predicates: " + p.getDataAlphabet().size());

    // Build the evaluator selected by --test: "lab..." for token accuracy, "seg=..." for
    // segmentation with comma-separated start.continue tag pairs.
    if (testOption.value != null) {
      if (testOption.value.startsWith("lab"))
        eval =
            new TokenAccuracyEvaluator(
                new InstanceList[] {trainingData, testData}, new String[] {"Training", "Testing"});
      else if (testOption.value.startsWith("seg=")) {
        String[] pairs = testOption.value.substring(4).split(",");
        if (pairs.length < 1) {
          commandOptions.printUsage(true);
          throw new IllegalArgumentException(
              "Missing segment start/continue labels: " + testOption.value);
        }
        String startTags[] = new String[pairs.length];
        String continueTags[] = new String[pairs.length];
        for (int i = 0; i < pairs.length; i++) {
          // Each pair must be exactly "start.continue".
          String[] pair = pairs[i].split("\\.");
          if (pair.length != 2) {
            commandOptions.printUsage(true);
            throw new IllegalArgumentException(
                "Incorrectly-specified segment start and end labels: " + pairs[i]);
          }
          startTags[i] = pair[0];
          continueTags[i] = pair[1];
        }
        eval =
            new MultiSegmentationEvaluator(
                new InstanceList[] {trainingData, testData},
                new String[] {"Training", "Testing"},
                startTags,
                continueTags);
      } else {
        commandOptions.printUsage(true);
        throw new IllegalArgumentException("Invalid test option: " + testOption.value);
      }
    }

    // Log the label set whenever targets are being processed.
    if (p.isTargetProcessing()) {
      Alphabet targets = p.getTargetAlphabet();
      StringBuffer buf = new StringBuffer("Labels:");
      for (int i = 0; i < targets.size(); i++)
        buf.append(" ").append(targets.lookupObject(i).toString());
      logger.info(buf.toString());
    }
    if (trainOption.value) {
      // Train (or continue training the loaded crf) and save the model if a file was given.
      crf =
          train(
              trainingData,
              testData,
              eval,
              ordersOption.value,
              defaultOption.value,
              forbiddenOption.value,
              allowedOption.value,
              connectedOption.value,
              iterationsOption.value,
              gaussianVarianceOption.value,
              crf);
      if (modelOption.value != null) {
        ObjectOutputStream s = new ObjectOutputStream(new FileOutputStream(modelOption.value));
        s.writeObject(crf);
        s.close();
      }
    } else {
      // NOTE(review): this reload appears unreachable - when trainOption.value is false the model
      // was already loaded near the top of the method and crf was dereferenced there.
      if (crf == null) {
        if (modelOption.value == null) {
          commandOptions.printUsage(true);
          throw new IllegalArgumentException("Missing model file option");
        }
        ObjectInputStream s = new ObjectInputStream(new FileInputStream(modelOption.value));
        crf = (CRF) s.readObject();
        s.close();
      }
      if (eval != null) test(new NoopTransducerTrainer(crf), eval, testData);
      else {
        // No evaluator: decode each input sequence and print the n-best labels per position.
        boolean includeInput = includeInputOption.value();
        for (int i = 0; i < testData.size(); i++) {
          Sequence input = (Sequence) testData.get(i).getData();
          Sequence[] outputs = apply(crf, input, nBestOption.value);
          int k = outputs.length;
          boolean error = false;
          // An answer whose length differs from the input is reported as a decoding failure and
          // suppresses the output for this whole sequence.
          for (int a = 0; a < k; a++) {
            if (outputs[a].size() != input.size()) {
              System.err.println("Failed to decode input sequence " + i + ", answer " + a);
              error = true;
            }
          }
          if (!error) {
            // One output line per position: the k answers, then optionally the input features.
            for (int j = 0; j < input.size(); j++) {
              StringBuffer buf = new StringBuffer();
              for (int a = 0; a < k; a++) buf.append(outputs[a].get(j).toString()).append(" ");
              if (includeInput) {
                FeatureVector fv = (FeatureVector) input.get(j);
                buf.append(fv.toString(true));
              }
              System.out.println(buf.toString());
            }
            // Blank line separates consecutive sequences.
            System.out.println();
          }
        }
      }
    }
  }