Java Corpus 예제들

프로그래밍 언어: Java

네임스페이스/패키지 이름: java.util

클래스/타입: Corpus

hotexamples.com에서의 예제들: 2

Java Corpus - 2개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Java의 java.util.Corpus에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

add(1)

binarizeAndFilterTrees(1)

getTrainTrees(1)

getValidationTrees(1)

iterator(1)

lowercaseWords(1)

replaceRareWords(1)

예제 #1

파일 보기

파일: GrammarTrainer.java 프로젝트: ashish01/ssie-versioned

  public static void main(String[] args) {
    OptionParser optParser = new OptionParser(Options.class);
    Options opts = (Options) optParser.parse(args, true);
    // provide feedback on command-line arguments
    System.out.println("Calling with " + optParser.getPassedInOptions());

    String path = opts.path;
    //    int lang = opts.lang;
    System.out.println("Loading trees from " + path + " and using language " + opts.treebank);

    double trainingFractionToKeep = opts.trainingFractionToKeep;

    int maxSentenceLength = opts.maxSentenceLength;
    System.out.println("Will remove sentences with more than " + maxSentenceLength + " words.");

    HORIZONTAL_MARKOVIZATION = opts.horizontalMarkovization;
    VERTICAL_MARKOVIZATION = opts.verticalMarkovization;
    System.out.println(
        "Using horizontal="
            + HORIZONTAL_MARKOVIZATION
            + " and vertical="
            + VERTICAL_MARKOVIZATION
            + " markovization.");

    Binarization binarization = opts.binarization;
    System.out.println(
        "Using " + binarization.name() + " binarization."); // and "+annotateString+".");

    double randomness = opts.randomization;
    System.out.println("Using a randomness value of " + randomness);

    String outFileName = opts.outFileName;
    if (outFileName == null) {
      System.out.println("Output File name is required.");
      System.exit(-1);
    } else System.out.println("Using grammar output file " + outFileName + ".");

    VERBOSE = opts.verbose;
    RANDOM = new Random(opts.randSeed);
    System.out.println("Random number generator seeded at " + opts.randSeed + ".");

    boolean manualAnnotation = false;
    boolean baseline = opts.baseline;
    boolean noSplit = opts.noSplit;
    int numSplitTimes = opts.numSplits;
    if (baseline) numSplitTimes = 0;
    String splitGrammarFile = opts.inFile;
    int allowedDroppingIters = opts.di;

    int maxIterations = opts.splitMaxIterations;
    int minIterations = opts.splitMinIterations;
    if (minIterations > 0)
      System.out.println("I will do at least " + minIterations + " iterations.");

    double[] smoothParams = {opts.smoothingParameter1, opts.smoothingParameter2};
    System.out.println("Using smoothing parameters " + smoothParams[0] + " and " + smoothParams[1]);

    boolean allowMoreSubstatesThanCounts = false;
    boolean findClosedUnaryPaths = opts.findClosedUnaryPaths;

    Corpus corpus =
        new Corpus(
            path,
            opts.treebank,
            trainingFractionToKeep,
            false,
            opts.skipSection,
            opts.skipBilingual);
    List<Tree<String>> trainTrees =
        Corpus.binarizeAndFilterTrees(
            corpus.getTrainTrees(),
            VERTICAL_MARKOVIZATION,
            HORIZONTAL_MARKOVIZATION,
            maxSentenceLength,
            binarization,
            manualAnnotation,
            VERBOSE);
    List<Tree<String>> validationTrees =
        Corpus.binarizeAndFilterTrees(
            corpus.getValidationTrees(),
            VERTICAL_MARKOVIZATION,
            HORIZONTAL_MARKOVIZATION,
            maxSentenceLength,
            binarization,
            manualAnnotation,
            VERBOSE);
    Numberer tagNumberer = Numberer.getGlobalNumberer("tags");

    //		for (Tree<String> t : trainTrees){
    //				System.out.println(t);
    //			}

    if (opts.trainOnDevSet) {
      System.out.println("Adding devSet to training data.");
      trainTrees.addAll(validationTrees);
    }

    if (opts.lowercase) {
      System.out.println("Lowercasing the treebank.");
      Corpus.lowercaseWords(trainTrees);
      Corpus.lowercaseWords(validationTrees);
    }

    int nTrees = trainTrees.size();

    System.out.println("There are " + nTrees + " trees in the training set.");

    double filter = opts.filter;
    if (filter > 0)
      System.out.println(
          "Will remove rules with prob under "
              + filter
              + ".\nEven though only unlikely rules are pruned the training LL is not guaranteed to increase in every round anymore "
              + "(especially when we are close to converging)."
              + "\nFurthermore it increases the variance because 'good' rules can be pruned away in early stages.");

    short nSubstates = opts.nSubStates;
    short[] numSubStatesArray =
        initializeSubStateArray(trainTrees, validationTrees, tagNumberer, nSubstates);
    if (baseline) {
      short one = 1;
      Arrays.fill(numSubStatesArray, one);
      System.out.println("Training just the baseline grammar (1 substate for all states)");
      randomness = 0.0f;
    }

    if (VERBOSE) {
      for (int i = 0; i < numSubStatesArray.length; i++) {
        System.out.println("Tag " + (String) tagNumberer.object(i) + " " + i);
      }
    }

    System.out.println("There are " + numSubStatesArray.length + " observed categories.");

    // initialize lexicon and grammar
    Lexicon lexicon = null, maxLexicon = null, previousLexicon = null;
    Grammar grammar = null, maxGrammar = null, previousGrammar = null;
    double maxLikelihood = Double.NEGATIVE_INFINITY;

    //    String smootherStr = opts.smooth;
    //    Smoother lexiconSmoother = null;
    //    Smoother grammarSmoother = null;
    //    if (splitGrammarFile!=null){
    //    	lexiconSmoother = maxLexicon.smoother;
    //    	grammarSmoother = maxGrammar.smoother;
    //    	System.out.println("Using smoother from input grammar.");
    //    }
    //    else if (smootherStr.equals("NoSmoothing"))
    //    	lexiconSmoother = grammarSmoother = new NoSmoothing();
    //    else if (smootherStr.equals("SmoothAcrossParentBits")) {
    //    	lexiconSmoother = grammarSmoother  = new SmoothAcrossParentBits(grammarSmoothing,
    // maxGrammar.splitTrees);
    //    }
    //    else
    //    	throw new Error("I didn't understand the type of smoother '"+smootherStr+"'");
    //    System.out.println("Using smoother "+smootherStr);

    // EM: iterate until the validation likelihood drops for four consecutive
    // iterations
    int iter = 0;
    int droppingIter = 0;

    //  If we are splitting, we load the old grammar and start off by splitting.
    int startSplit = 0;
    if (splitGrammarFile != null) {
      System.out.println("Loading old grammar from " + splitGrammarFile);
      startSplit = 1; // we've already trained the grammar
      ParserData pData = ParserData.Load(splitGrammarFile);
      maxGrammar = pData.gr;
      maxLexicon = pData.lex;
      numSubStatesArray = maxGrammar.numSubStates;
      previousGrammar = grammar = maxGrammar;
      previousLexicon = lexicon = maxLexicon;
      Numberer.setNumberers(pData.getNumbs());
      tagNumberer = Numberer.getGlobalNumberer("tags");
      System.out.println("Loading old grammar complete.");
      if (noSplit) {
        System.out.println("Will NOT split the loaded grammar.");
        startSplit = 0;
      }
    }

    double mergingPercentage = opts.mergingPercentage;
    boolean separateMergingThreshold = opts.separateMergingThreshold;
    if (mergingPercentage > 0) {
      System.out.println(
          "Will merge " + (int) (mergingPercentage * 100) + "% of the splits in each round.");
      System.out.println(
          "The threshold for merging lexical and phrasal categories will be set separately: "
              + separateMergingThreshold);
    }

    StateSetTreeList trainStateSetTrees =
        new StateSetTreeList(trainTrees, numSubStatesArray, false, tagNumberer);
    StateSetTreeList validationStateSetTrees =
        new StateSetTreeList(validationTrees, numSubStatesArray, false, tagNumberer); // deletePC);

    // get rid of the old trees
    trainTrees = null;
    validationTrees = null;
    corpus = null;
    System.gc();

    if (opts.simpleLexicon) {
      System.out.println(
          "Replacing words which have been seen less than 5 times with their signature.");
      Corpus.replaceRareWords(
          trainStateSetTrees, new SimpleLexicon(numSubStatesArray, -1), opts.rare);
    }

    // If we're training without loading a split grammar, then we run once without splitting.
    if (splitGrammarFile == null) {
      grammar =
          new Grammar(numSubStatesArray, findClosedUnaryPaths, new NoSmoothing(), null, filter);
      Lexicon tmp_lexicon =
          (opts.simpleLexicon)
              ? new SimpleLexicon(
                  numSubStatesArray,
                  -1,
                  smoothParams,
                  new NoSmoothing(),
                  filter,
                  trainStateSetTrees)
              : new SophisticatedLexicon(
                  numSubStatesArray,
                  SophisticatedLexicon.DEFAULT_SMOOTHING_CUTOFF,
                  smoothParams,
                  new NoSmoothing(),
                  filter);
      int n = 0;
      boolean secondHalf = false;
      for (Tree<StateSet> stateSetTree : trainStateSetTrees) {
        secondHalf = (n++ > nTrees / 2.0);
        tmp_lexicon.trainTree(stateSetTree, randomness, null, secondHalf, false, opts.rare);
      }
      lexicon =
          (opts.simpleLexicon)
              ? new SimpleLexicon(
                  numSubStatesArray,
                  -1,
                  smoothParams,
                  new NoSmoothing(),
                  filter,
                  trainStateSetTrees)
              : new SophisticatedLexicon(
                  numSubStatesArray,
                  SophisticatedLexicon.DEFAULT_SMOOTHING_CUTOFF,
                  smoothParams,
                  new NoSmoothing(),
                  filter);
      for (Tree<StateSet> stateSetTree : trainStateSetTrees) {
        secondHalf = (n++ > nTrees / 2.0);
        lexicon.trainTree(stateSetTree, randomness, tmp_lexicon, secondHalf, false, opts.rare);
        grammar.tallyUninitializedStateSetTree(stateSetTree);
      }
      lexicon.tieRareWordStats(opts.rare);
      lexicon.optimize();

      // SSIE
      ((SophisticatedLexicon) lexicon).overwriteWithMaxent();

      grammar.optimize(randomness);
      // System.out.println(grammar);
      previousGrammar = maxGrammar = grammar; // needed for baseline - when there is no EM loop
      previousLexicon = maxLexicon = lexicon;
    }

    // the main loop: split and train the grammar
    for (int splitIndex = startSplit; splitIndex < numSplitTimes * 3; splitIndex++) {

      // now do either a merge or a split and the end a smooth
      // on odd iterations merge, on even iterations split
      String opString = "";
      if (splitIndex % 3 == 2) { // (splitIndex==numSplitTimes*2){
        if (opts.smooth.equals("NoSmoothing")) continue;
        System.out.println("Setting smoother for grammar and lexicon.");
        Smoother grSmoother = new SmoothAcrossParentBits(0.01, maxGrammar.splitTrees);
        Smoother lexSmoother = new SmoothAcrossParentBits(0.1, maxGrammar.splitTrees);
        //        Smoother grSmoother = new SmoothAcrossParentSubstate(0.01);
        //        Smoother lexSmoother = new SmoothAcrossParentSubstate(0.1);
        maxGrammar.setSmoother(grSmoother);
        maxLexicon.setSmoother(lexSmoother);
        minIterations = maxIterations = opts.smoothMaxIterations;
        opString = "smoothing";
      } else if (splitIndex % 3 == 0) {
        // the case where we split
        if (opts.noSplit) continue;
        System.out.println(
            "Before splitting, we have a total of " + maxGrammar.totalSubStates() + " substates.");
        CorpusStatistics corpusStatistics = new CorpusStatistics(tagNumberer, trainStateSetTrees);
        int[] counts = corpusStatistics.getSymbolCounts();

        maxGrammar = maxGrammar.splitAllStates(randomness, counts, allowMoreSubstatesThanCounts, 0);
        maxLexicon = maxLexicon.splitAllStates(counts, allowMoreSubstatesThanCounts, 0);
        Smoother grSmoother = new NoSmoothing();
        Smoother lexSmoother = new NoSmoothing();
        maxGrammar.setSmoother(grSmoother);
        maxLexicon.setSmoother(lexSmoother);
        System.out.println(
            "After splitting, we have a total of " + maxGrammar.totalSubStates() + " substates.");
        System.out.println(
            "Rule probabilities are NOT normalized in the split, therefore the training LL is not guaranteed to improve between iteration 0 and 1!");
        opString = "splitting";
        maxIterations = opts.splitMaxIterations;
        minIterations = opts.splitMinIterations;
      } else {
        if (mergingPercentage == 0) continue;
        // the case where we merge
        double[][] mergeWeights =
            GrammarMerger.computeMergeWeights(maxGrammar, maxLexicon, trainStateSetTrees);
        double[][][] deltas =
            GrammarMerger.computeDeltas(maxGrammar, maxLexicon, mergeWeights, trainStateSetTrees);
        boolean[][][] mergeThesePairs =
            GrammarMerger.determineMergePairs(
                deltas, separateMergingThreshold, mergingPercentage, maxGrammar);

        grammar = GrammarMerger.doTheMerges(maxGrammar, maxLexicon, mergeThesePairs, mergeWeights);
        short[] newNumSubStatesArray = grammar.numSubStates;
        trainStateSetTrees = new StateSetTreeList(trainStateSetTrees, newNumSubStatesArray, false);
        validationStateSetTrees =
            new StateSetTreeList(validationStateSetTrees, newNumSubStatesArray, false);

        // retrain lexicon to finish the lexicon merge (updates the unknown words model)...
        lexicon =
            (opts.simpleLexicon)
                ? new SimpleLexicon(
                    newNumSubStatesArray,
                    -1,
                    smoothParams,
                    maxLexicon.getSmoother(),
                    filter,
                    trainStateSetTrees)
                : new SophisticatedLexicon(
                    newNumSubStatesArray,
                    SophisticatedLexicon.DEFAULT_SMOOTHING_CUTOFF,
                    maxLexicon.getSmoothingParams(),
                    maxLexicon.getSmoother(),
                    maxLexicon.getPruningThreshold());
        boolean updateOnlyLexicon = true;
        double trainingLikelihood =
            GrammarTrainer.doOneEStep(
                grammar,
                maxLexicon,
                null,
                lexicon,
                trainStateSetTrees,
                updateOnlyLexicon,
                opts.rare);
        //    		System.out.println("The training LL is "+trainingLikelihood);
        lexicon
            .optimize(); // Grammar.RandomInitializationType.INITIALIZE_WITH_SMALL_RANDOMIZATION);
                         // // M Step

        GrammarMerger.printMergingStatistics(maxGrammar, grammar);
        opString = "merging";
        maxGrammar = grammar;
        maxLexicon = lexicon;
        maxIterations = opts.mergeMaxIterations;
        minIterations = opts.mergeMinIterations;
      }
      // update the substate dependent objects
      previousGrammar = grammar = maxGrammar;
      previousLexicon = lexicon = maxLexicon;
      droppingIter = 0;
      numSubStatesArray = grammar.numSubStates;
      trainStateSetTrees = new StateSetTreeList(trainStateSetTrees, numSubStatesArray, false);
      validationStateSetTrees =
          new StateSetTreeList(validationStateSetTrees, numSubStatesArray, false);
      maxLikelihood = calculateLogLikelihood(maxGrammar, maxLexicon, validationStateSetTrees);
      System.out.println(
          "After "
              + opString
              + " in the "
              + (splitIndex / 3 + 1)
              + "th round, we get a validation likelihood of "
              + maxLikelihood);
      iter = 0;

      // the inner loop: train the grammar via EM until validation likelihood reliably drops
      do {
        iter += 1;
        System.out.println("Beginning iteration " + (iter - 1) + ":");

        // 1) Compute the validation likelihood of the previous iteration
        System.out.print("Calculating validation likelihood...");
        double validationLikelihood =
            calculateLogLikelihood(
                previousGrammar,
                previousLexicon,
                validationStateSetTrees); // The validation LL of previousGrammar/previousLexicon
        System.out.println("done: " + validationLikelihood);

        // 2) Perform the E step while computing the training likelihood of the previous iteration
        System.out.print("Calculating training likelihood...");
        grammar =
            new Grammar(
                grammar.numSubStates,
                grammar.findClosedPaths,
                grammar.smoother,
                grammar,
                grammar.threshold);
        lexicon =
            (opts.simpleLexicon)
                ? new SimpleLexicon(
                    grammar.numSubStates,
                    -1,
                    smoothParams,
                    lexicon.getSmoother(),
                    filter,
                    trainStateSetTrees)
                : new SophisticatedLexicon(
                    grammar.numSubStates,
                    SophisticatedLexicon.DEFAULT_SMOOTHING_CUTOFF,
                    lexicon.getSmoothingParams(),
                    lexicon.getSmoother(),
                    lexicon.getPruningThreshold());
        boolean updateOnlyLexicon = false;
        double trainingLikelihood =
            doOneEStep(
                previousGrammar,
                previousLexicon,
                grammar,
                lexicon,
                trainStateSetTrees,
                updateOnlyLexicon,
                opts.rare); // The training LL of previousGrammar/previousLexicon
        System.out.println("done: " + trainingLikelihood);

        // 3) Perform the M-Step
        lexicon.optimize(); // M Step
        grammar.optimize(0); // M Step

        // 4) Check whether previousGrammar/previousLexicon was in fact better than the best
        if (iter < minIterations || validationLikelihood >= maxLikelihood) {
          maxLikelihood = validationLikelihood;
          maxGrammar = previousGrammar;
          maxLexicon = previousLexicon;
          droppingIter = 0;
        } else {
          droppingIter++;
        }

        // 5) advance the 'pointers'
        previousGrammar = grammar;
        previousLexicon = lexicon;
      } while ((droppingIter < allowedDroppingIters) && (!baseline) && (iter < maxIterations));

      // Dump a grammar file to disk from time to time
      ParserData pData =
          new ParserData(
              maxLexicon,
              maxGrammar,
              null,
              Numberer.getNumberers(),
              numSubStatesArray,
              VERTICAL_MARKOVIZATION,
              HORIZONTAL_MARKOVIZATION,
              binarization);
      String outTmpName = outFileName + "_" + (splitIndex / 3 + 1) + "_" + opString + ".gr";
      System.out.println("Saving grammar to " + outTmpName + ".");
      if (pData.Save(outTmpName)) System.out.println("Saving successful.");
      else System.out.println("Saving failed!");
      pData = null;
    }

    // The last grammar/lexicon has not yet been evaluated. Even though the validation likelihood
    // has been dropping in the past few iteration, there is still a chance that the last one was in
    // fact the best so just in case we evaluate it.
    System.out.print("Calculating last validation likelihood...");
    double validationLikelihood = calculateLogLikelihood(grammar, lexicon, validationStateSetTrees);
    System.out.println(
        "done.\n  Iteration "
            + iter
            + " (final) gives validation likelihood "
            + validationLikelihood);
    if (validationLikelihood > maxLikelihood) {
      maxLikelihood = validationLikelihood;
      maxGrammar = previousGrammar;
      maxLexicon = previousLexicon;
    }

    ParserData pData =
        new ParserData(
            maxLexicon,
            maxGrammar,
            null,
            Numberer.getNumberers(),
            numSubStatesArray,
            VERTICAL_MARKOVIZATION,
            HORIZONTAL_MARKOVIZATION,
            binarization);

    System.out.println("Saving grammar to " + outFileName + ".");
    System.out.println("It gives a validation data log likelihood of: " + maxLikelihood);
    if (pData.Save(outFileName)) System.out.println("Saving successful.");
    else System.out.println("Saving failed!");

    System.exit(0);
  }

예제 #2

파일 보기

파일: StandAloneAnnie.java 프로젝트: ecohealthalliance/grits-gate

  /**
   * Run from the command-line, with a list of URLs as argument.
   *
   * <p><B>NOTE:</B><br>
   * This code will run with all the documents in memory - if you want to unload each from memory
   * after use, add code to store the corpus in a DataStore.
   */
  public static void main(String args[]) throws GateException, IOException {
    // initialise the GATE library
    Out.prln("Initialising GATE...");
    Gate.init();
    Out.prln("...GATE initialised");

    // initialise ANNIE (this may take several minutes)
    StandAloneAnnie annie = new StandAloneAnnie();
    annie.initAnnie();

    // create a GATE corpus and add a document for each command-line
    // argument
    Corpus corpus = Factory.newCorpus("StandAloneAnnie corpus");
    for (int i = 0; i < args.length; i++) {
      URL u = new URL(args[i]);
      FeatureMap params = Factory.newFeatureMap();
      params.put("sourceUrl", u);
      params.put("preserveOriginalContent", new Boolean(true));
      params.put("collectRepositioningInfo", new Boolean(true));
      Out.prln("Creating doc for " + u);
      Document doc = (Document) Factory.createResource("gate.corpora.DocumentImpl", params);
      corpus.add(doc);
    } // for each of args

    // tell the pipeline about the corpus and run it
    annie.setCorpus(corpus);
    annie.execute();

    // for each document, get an XML document with the
    // person and location names added
    Iterator iter = corpus.iterator();
    int count = 0;
    String startTagPart_1 = "<span GateID=\"";
    String startTagPart_2 = "\" title=\"";
    String startTagPart_3 = "\" style=\"background:Red;\">";
    String endTag = "</span>";

    while (iter.hasNext()) {
      Document doc = (Document) iter.next();
      AnnotationSet defaultAnnotSet = doc.getAnnotations();
      Set annotTypesRequired = new HashSet();
      annotTypesRequired.add("Person");
      annotTypesRequired.add("Location");
      Set<Annotation> peopleAndPlaces =
          new HashSet<Annotation>(defaultAnnotSet.get(annotTypesRequired));

      FeatureMap features = doc.getFeatures();
      String originalContent =
          (String) features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
      RepositioningInfo info =
          (RepositioningInfo) features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);

      ++count;
      File file = new File("StANNIE_" + count + ".HTML");
      Out.prln("File name: '" + file.getAbsolutePath() + "'");
      if (originalContent != null && info != null) {
        Out.prln("OrigContent and reposInfo existing. Generate file...");

        Iterator it = peopleAndPlaces.iterator();
        Annotation currAnnot;
        SortedAnnotationList sortedAnnotations = new SortedAnnotationList();

        while (it.hasNext()) {
          currAnnot = (Annotation) it.next();
          sortedAnnotations.addSortedExclusive(currAnnot);
        } // while

        StringBuffer editableContent = new StringBuffer(originalContent);
        long insertPositionEnd;
        long insertPositionStart;
        // insert anotation tags backward
        Out.prln("Unsorted annotations count: " + peopleAndPlaces.size());
        Out.prln("Sorted annotations count: " + sortedAnnotations.size());
        for (int i = sortedAnnotations.size() - 1; i >= 0; --i) {
          currAnnot = (Annotation) sortedAnnotations.get(i);
          insertPositionStart = currAnnot.getStartNode().getOffset().longValue();
          insertPositionStart = info.getOriginalPos(insertPositionStart);
          insertPositionEnd = currAnnot.getEndNode().getOffset().longValue();
          insertPositionEnd = info.getOriginalPos(insertPositionEnd, true);
          if (insertPositionEnd != -1 && insertPositionStart != -1) {
            editableContent.insert((int) insertPositionEnd, endTag);
            editableContent.insert((int) insertPositionStart, startTagPart_3);
            editableContent.insert((int) insertPositionStart, currAnnot.getType());
            editableContent.insert((int) insertPositionStart, startTagPart_2);
            editableContent.insert((int) insertPositionStart, currAnnot.getId().toString());
            editableContent.insert((int) insertPositionStart, startTagPart_1);
          } // if
        } // for

        FileWriter writer = new FileWriter(file);
        writer.write(editableContent.toString());
        writer.close();
      } // if - should generate
      else if (originalContent != null) {
        Out.prln("OrigContent existing. Generate file...");

        Iterator it = peopleAndPlaces.iterator();
        Annotation currAnnot;
        SortedAnnotationList sortedAnnotations = new SortedAnnotationList();

        while (it.hasNext()) {
          currAnnot = (Annotation) it.next();
          sortedAnnotations.addSortedExclusive(currAnnot);
        } // while

        StringBuffer editableContent = new StringBuffer(originalContent);
        long insertPositionEnd;
        long insertPositionStart;
        // insert anotation tags backward
        Out.prln("Unsorted annotations count: " + peopleAndPlaces.size());
        Out.prln("Sorted annotations count: " + sortedAnnotations.size());
        for (int i = sortedAnnotations.size() - 1; i >= 0; --i) {
          currAnnot = (Annotation) sortedAnnotations.get(i);
          insertPositionStart = currAnnot.getStartNode().getOffset().longValue();
          insertPositionEnd = currAnnot.getEndNode().getOffset().longValue();
          if (insertPositionEnd != -1 && insertPositionStart != -1) {
            editableContent.insert((int) insertPositionEnd, endTag);
            editableContent.insert((int) insertPositionStart, startTagPart_3);
            editableContent.insert((int) insertPositionStart, currAnnot.getType());
            editableContent.insert((int) insertPositionStart, startTagPart_2);
            editableContent.insert((int) insertPositionStart, currAnnot.getId().toString());
            editableContent.insert((int) insertPositionStart, startTagPart_1);
          } // if
        } // for

        FileWriter writer = new FileWriter(file);
        writer.write(editableContent.toString());
        writer.close();
      } else {
        Out.prln("Content : " + originalContent);
        Out.prln("Repositioning: " + info);
      }

      String xmlDocument = doc.toXml(peopleAndPlaces, false);
      String fileName = new String("StANNIE_toXML_" + count + ".HTML");
      FileWriter writer = new FileWriter(fileName);
      writer.write(xmlDocument);
      writer.close();
    } // for each doc
  } // main