Exemplo n.º 1
0
 /**
  * Splits {@code original} into two lists after a deterministic shuffle.
  *
  * <p>When {@code splitForSecond} is greater than zero, a copy of the input is shuffled with a
  * fixed seed ({@code new Random(0L)}), so repeated calls on the same input produce the same
  * split. The first list receives {@code (int) ((1.0 - splitForSecond) * size)} elements
  * (truncated toward zero) and the second list receives the remainder. Otherwise every element
  * goes to the first list in its original order and the second list is empty.
  *
  * <p>The input list is never modified, so unmodifiable lists are accepted and callers keep
  * their original ordering.
  *
  * @param original the elements to split; left untouched
  * @param splitForSecond fraction of elements destined for the second list; values {@code <= 0}
  *     (and NaN) disable splitting. NOTE(review): values above 1.0 are not validated here and
  *     can produce a negative count that {@code subList} rejects.
  * @return a pair of freshly allocated lists: (first, second)
  */
 private static <T> Pair<List<T>, List<T>> splitData(List<T> original, double splitForSecond) {
   List<T> first = new ArrayList<>();
   List<T> second = new ArrayList<>();
   if (splitForSecond > 0.0) {
     // Shuffle a private copy: shuffling `original` directly would reorder the caller's list
     // as a hidden side effect and would fail outright on unmodifiable lists.
     List<T> shuffled = new ArrayList<>(original);
     Collections.shuffle(shuffled, new Random(0L));
     int numFirst = (int) ((1.0 - splitForSecond) * shuffled.size());
     first.addAll(shuffled.subList(0, numFirst));
     second.addAll(shuffled.subList(numFirst, shuffled.size()));
   } else {
     first.addAll(original);
     // second stays empty
   }
   return Tuples.pair(first, second);
 }
Exemplo n.º 2
0
  /**
   * Trains a CRF model from CoNLL-formatted labeled data and writes it to {@code opts.modelPath}.
   *
   * <p>Steps, in order:
   *
   * <ol>
   *   <li>Read the feature template lines from {@code opts.templateFile} and build the predicate
   *       extractor from them.
   *   <li>Read the training sequences from {@code opts.trainPath} and convert every row into a
   *       (row, label) pair.
   *   <li>Split the sequences into train/test portions via {@code splitData} using
   *       {@code opts.testSplitRatio}.
   *   <li>Configure and run {@link CRFTrainer}, logging train (and, when present, test) token
   *       accuracy from the per-iteration callback.
   *   <li>Save the template lines, feature encoder and learned weights to
   *       {@code opts.modelPath}.
   * </ol>
   *
   * <p>Checked exceptions (e.g. I/O failures) propagate unchecked through
   * {@code @SneakyThrows}.
   *
   * @param opts training configuration: input paths, optimizer settings, thread count, test
   *     split ratio and output model path
   */
  @SneakyThrows
  public static void trainAndSaveModel(Opts opts) {
    // Load labeled data
    List<String> templateLines = linesFromPath(opts.templateFile).collect(toList());
    val predExtractor = ConllFormat.predicatesFromTemplate(templateLines.stream());
    List<List<Pair<ConllFormat.Row, String>>> labeledData =
        ConllFormat.readData(linesFromPath(opts.trainPath), true)
            .stream()
            .map(x -> x.stream().map(y -> y.asLabeledPair().swap()).collect(Collectors.toList()))
            .collect(Collectors.toList());

    // Split train/test data
    logger.info(
        "CRF training with {} threads and {} labeled examples",
        opts.numThreads,
        labeledData.size());
    val trainTestPair = splitData(labeledData, opts.testSplitRatio);
    List<List<Pair<ConllFormat.Row, String>>> trainLabeledData = trainTestPair.getOne();
    List<List<Pair<ConllFormat.Row, String>>> testLabeledData = trainTestPair.getTwo();

    // Set up Train options
    CRFTrainer.Opts trainOpts = new CRFTrainer.Opts();
    trainOpts.sigmaSq = opts.sigmaSquared;
    trainOpts.lbfgsHistorySize = opts.lbfgsHistorySize;
    // Reciprocal of the keep probability, truncated to an int.
    trainOpts.minExpectedFeatureCount = (int) (1.0 / opts.featureKeepProb);
    trainOpts.numThreads = opts.numThreads;

    // Trainer
    CRFTrainer<String, ConllFormat.Row, String> trainer =
        new CRFTrainer<>(trainLabeledData, predExtractor, trainOpts);

    // Setup iteration callback, weird trick here where you require
    // the trainer to make a model for each iteration but then need
    // to modify the iteration-callback to use it
    Parallel.MROpts evalMrOpts =
        Parallel.MROpts.withIdAndThreads("mr-crf-train-eval", opts.numThreads);
    trainOpts.optimizerOpts.iterCallback =
        (weights) -> {
          CRFModel<String, ConllFormat.Row, String> crfModel = trainer.modelForWeights(weights);
          long start = System.currentTimeMillis();
          // Evaluation takes (label, row) pairs, so flip each (row, label) pair back.
          List<List<Pair<String, ConllFormat.Row>>> trainEvalData =
              trainLabeledData
                  .stream()
                  .map(x -> x.stream().map(Pair::swap).collect(toList()))
                  .collect(toList());

          Evaluation<String> eval = Evaluation.compute(crfModel, trainEvalData, evalMrOpts);
          long stop = System.currentTimeMillis();
          logger.info(
              "Train Accuracy: {} (took {} ms)", eval.tokenAccuracy.accuracy(), stop - start);
          if (!testLabeledData.isEmpty()) {
            start = System.currentTimeMillis();
            List<List<Pair<String, ConllFormat.Row>>> testEvalData =
                testLabeledData
                    .stream()
                    .map(x -> x.stream().map(Pair::swap).collect(toList()))
                    .collect(toList());
            eval = Evaluation.compute(crfModel, testEvalData, evalMrOpts);
            stop = System.currentTimeMillis();
            logger.info(
                "Test Accuracy: {} (took {} ms)", eval.tokenAccuracy.accuracy(), stop - start);
          }
        };

    CRFModel<String, ConllFormat.Row, String> crfModel;
    try {
      crfModel = trainer.train(trainLabeledData);
    } finally {
      // Release the evaluation executor even when training fails, so its threads do not
      // outlive this call.
      Parallel.shutdownExecutor(evalMrOpts.executorService, Long.MAX_VALUE);
    }
    Vector weights = crfModel.weights();
    logger.info("Writing model to {}", opts.modelPath);
    // try-with-resources closes the stream (and the underlying file handle) on both the
    // success and the failure path.
    try (DataOutputStream dos = new DataOutputStream(new FileOutputStream(opts.modelPath))) {
      ConllFormat.saveModel(dos, templateLines, crfModel.featureEncoder, weights);
    }
  }