Beispiel #1
0
  /**
   * Checks that a {@code <br>} separating two words yields exactly one space in the extracted
   * text, whether or not the markup already has whitespace around the break.
   */
  @Test
  public void testBrHasSpace() {
    // No whitespace around the break: document text and the paragraph's own text both get a space.
    Document tight = Jsoup.parse("<p>Hello<br>there</p>");
    assertEquals("Hello there", tight.text());
    assertEquals("Hello there", tight.select("p").first().ownText());

    // Whitespace on both sides of the break: still a single space in the document text.
    Document padded = Jsoup.parse("<p>Hello <br> there</p>");
    assertEquals("Hello there", padded.text());
  }
  /**
   * Parses two tables separated by a {@code ****} line, executes the resulting document and checks
   * the right/wrong counts reported by its statistics.
   */
  @Test
  public void testInterpretsASequenceOfTables() {
    // First table: header row naming the interpreter and fixture classes, then the data rows.
    String firstTable =
        "["
            + RuleForInterpreter.class.getName()
            + "]["
            + AlternateCalculator.class.getName()
            + "]\n"
            + "[a][b][sum?]\n"
            + "[6][2][8]\n"
            + "[5][2][8]\n";

    // Second table, built the same way.
    String secondTable =
        "["
            + SetOfInterpreter.class.getName()
            + "]["
            + RowFixtureTarget.class.getName()
            + "]\n"
            + "[a][b][c]\n"
            + "[1][2][3]";

    tables = parse(firstTable + "****\n" + secondTable);

    Document spec = Document.text(tables);
    execute(spec);

    assertEquals(2, spec.getStatistics().rightCount());
    assertEquals(3, spec.getStatistics().wrongCount());
  }
 /**
  * Collects city names from a parsed page.
  *
  * <p>The plain text of the first {@code div#divIlanDetay} element is searched for a line holding
  * one of the location labels. If such a line is found and contains a colon, the text after the
  * first colon is trimmed, cut at its first space and split on commas; each piece is passed
  * through {@code trim} and added to the result.
  *
  * <p>If that yields nothing, the tokens of the whole document text are compared against a small
  * table of known cities and every hit is added instead.
  *
  * @param doc the parsed page
  * @return the cities found; empty if neither strategy finds any
  */
 public List<String> extractCities(Document doc) {
   // Known cities for the fallback lookup; the last entry's final letter (g with breve) is
   // written as a Unicode escape.
   HashMap<String, String> knownCities = new HashMap<String, String>();
   knownCities.put("Adana", "Adana");
   knownCities.put("Konya", "Konya");
   knownCities.put("Tekirda\u011f", "Tekirda\u011f");

   List<String> cities = new ArrayList<String>();

   Element detail = doc.select("div#divIlanDetay").first();
   Matcher labelLine =
       Pattern.compile(".*(\u015eehir/\u00dclke|City/Country|Location).*")
           .matcher(getPlainText(detail));
   if (labelLine.find()) {
     String[] labelAndValue = labelLine.group().split(":");
     if (labelAndValue.length > 1) {
       // Only the first space-delimited word after the colon is considered.
       String firstWord = labelAndValue[1].trim().split(" ")[0];
       for (String city : firstWord.split(",")) {
         cities.add(trim(city));
       }
     }
   }

   if (!cities.isEmpty()) {
     return cities;
   }

   // Fallback: look every token of the document text up in the known-city table.
   for (String token : tokenize(doc.text())) {
     if (knownCities.containsKey(token)) {
       cities.add(trim(knownCities.get(token)));
     }
   }
   return cities;
 }
Beispiel #4
0
 /**
  * Performs the action, adding the specified Annotation to the document.
  *
  * <p>The span of the new annotation is chosen as follows:
  *
  * <ul>
  *   <li>no span variable: from the pattern's start position to its best (end) position;
  *   <li>span variable named {@code "0"}: an empty span at the pattern's start position;
  *   <li>otherwise: the Span, or the span of the Annotation, bound to the span variable.
  * </ul>
  *
  * <p>Before the annotation is added, {@code hideAnnotations} is called for this action's type
  * and for {@code "token"} over the chosen span. If a binding variable is set, the new annotation
  * is stored in the bindings under that variable's name.
  *
  * @param doc the document to annotate
  * @param patap the pattern application supplying positions and variable bindings
  * @return the end position of the new annotation's span, or -1 if the span variable is bound to
  *     something that is neither a Span nor an Annotation
  */
 @Override
 public int perform(Document doc, PatternApplication patap) {
   Span span;
   HashMap bindings = patap.bestBindings;
   if (spanVariable == null) {
     span = new Span(patap.startPosition, patap.bestPosition);
   } else if ("0".equals(spanVariable.name.toString())) {
     // Compare by content: '==' on Strings only tests reference identity and would miss a
     // variable name that equals "0" but is a distinct String instance.
     span = new Span(patap.startPosition, patap.startPosition);
   } else {
     Object value = bindings.get(spanVariable.name);
     if (value instanceof Span) {
       span = (Span) value;
     } else if (value instanceof Annotation) {
       span = ((Annotation) value).span();
     } else {
       System.out.println("Value of " + spanVariable.toString() + " is not a span.or annotation");
       return -1;
     }
   }
   if (Pat.trace) {
     Console.println(
         "Annotating "
             + doc.text(span)
             + " as "
             + type
             + " "
             + features.substitute(bindings).toSGMLString());
   }
   hideAnnotations(doc, type, span);
   hideAnnotations(doc, "token", span);
   Annotation newAnnotation = new Annotation(type, span, features.substitute(bindings));
   doc.addAnnotation(newAnnotation);
   if (bindingVariable != null) {
     bindings.put(bindingVariable.name, newAnnotation);
   }
   return span.end();
 }
Beispiel #5
0
 /**
  * Checks that line breaks inside {@code <pre><code>} survive both text extraction and HTML
  * serialisation of the body.
  */
 @Test
 public void testKeepsPreTextInCode() {
   String markup = "<pre><code>code\n\ncode</code></pre>";
   Document parsed = Jsoup.parse(markup);

   assertEquals("code\n\ncode", parsed.text());
   // The body's HTML is expected to equal the input markup.
   assertEquals(markup, parsed.body().html());
 }
Beispiel #6
0
 /**
  * Checks that document text is whitespace-normalised: paragraphs are separated by one space,
  * newlines collapse, and inline {@code <b>} elements do not split words.
  */
 @Test
 public void testNormalisesText() {
   Document parsed =
       Jsoup.parse("<p>Hello<p>There.</p> \n <p>Here <b>is</b> \n s<b>om</b>e text.");
   assertEquals("Hello There. Here is some text.", parsed.text());
 }
  /**
   * Adds <B>constit</B> annotations to an existing Document <CODE>doc</CODE> to represent the parse
   * tree structure <CODE>tree</CODE>.
   *
   * <p>Each terminal node of the tree is first aligned with the document text: leading whitespace
   * and at most one matching entry of {@code skip} are stepped over, then the node's word is
   * matched at the current position. On a match the node's {@code start} and {@code end} are set
   * (the end includes trailing whitespace up to the end of the span). If a word cannot be matched,
   * a diagnostic is written to {@code System.err} and the method returns without adding any
   * annotations.
   *
   * @param tree the parse tree (for a portion of Document doc)
   * @param doc the document
   * @param span the portion of doc covered by the parse tree
   * @param jetCategories if true, use Jet categories as terminal categories (if false, use
   *     categories read from parse trees)
   */
  public void addAnnotations(ParseTreeNode tree, Document doc, Span span, boolean jetCategories) {
    List<ParseTreeNode> leaves = getTerminalNodes(tree);
    String text = doc.text();
    int cursor = span.start();

    for (ParseTreeNode leaf : leaves) {
      // step over whitespace preceding the next word
      while (cursor < span.end() && Character.isWhitespace(text.charAt(cursor))) {
        cursor++;
      }
      // step over the first skip string found here, plus the whitespace after it
      for (String ignorable : skip) {
        if (text.startsWith(ignorable, cursor)) {
          cursor += ignorable.length();
          while (cursor < span.end() && Character.isWhitespace(text.charAt(cursor))) {
            cursor++;
          }
          break;
        }
      }

      // match next terminal node against next word in text
      int matched = matchTextToTree(text, cursor, leaf.word);
      if (matched <= 0) {
        System.err.println(
            "PTBReader.addAnnotations:  Cannot determine parse tree offset for word " + leaf.word);
        System.err.println("  at document offset " + cursor + " in sentence");
        System.err.println("  " + doc.text(span));
        return;
      }

      // the terminal's extent absorbs trailing whitespace, bounded by the span end
      int next = cursor + matched;
      while (next < span.end() && Character.isWhitespace(text.charAt(next))) {
        next++;
      }
      leaf.start = cursor;
      leaf.end = next;
      cursor = next;
    }

    if (!jetCategories) {
      determineNonTerminalSpans(tree, span.start());
      setAnnotations(tree, doc);
    } else {
      setJetAnnotations(tree, span, doc);
      StatParser.deleteUnusedConstits(doc, span, tree.ann); // <<<
    }
  }
Beispiel #8
0
  /**
   * Checks that setting an element's text replaces its children: after the call the element's
   * text is the new value and the nested paragraph is no longer found in the document.
   */
  @Test
  public void testSetText() {
    Document parsed = Jsoup.parse("<div id=1>Hello <p>there <b>now</b></p></div>");

    // Baseline before the mutation.
    assertEquals("Hello there now", parsed.text()); // need to sort out node whitespace
    assertEquals("there now", parsed.select("p").get(0).text());

    Element replaced = parsed.getElementById("1").text("Gone");

    assertEquals("Gone", replaced.text());
    assertEquals(0, parsed.select("p").size());
  }
  /**
   * Converts a set of Penn TreeBank files into text documents. Invoked by: PTBReader inputDir
   * outputDir. Converts all files with extension .mrg in inputDir to text documents, and writes
   * them into outputDir.
   *
   * <p>Each output file is placed at the input file's path relative to inputDir, passed through
   * {@code removeSuffix}; missing parent directories are created. If the argument count is not
   * exactly two, a usage message is printed and the JVM exits with status 1.
   *
   * @param args {@code args[0]} is the input directory, {@code args[1]} the output directory
   * @throws Exception if loading a tree file or writing an output file fails
   */
  public static void main(String[] args) throws Exception {
    if (args.length != 2) {
      System.out.println("usage: java " + PTBReader.class.getName() + " <inputDir> <outputDir>");
      System.exit(1);
    }

    File inputDir = new File(args[0]);
    File outputDir = new File(args[1]);
    PTBReader parser = new PTBReader();
    for (File file : getFiles(inputDir, ".mrg")) {
      String outFilename = removeSuffix(getRelativePath(inputDir, file));
      File outFile = new File(outputDir, outFilename);
      outFile.getParentFile().mkdirs();

      Writer out = new FileWriter(outFile);
      try {
        Document doc = parser.load(file).getDocument();
        out.write(doc.text());
      } finally {
        // Always release the file handle, even if loading or writing throws.
        out.close();
      }
    }
  }
 /** Returns the result of {@code Document.text} applied to the current value of {@code tables}. */
 private Document document() {
   return Document.text(tables);
 }
Beispiel #11
0
 /**
  * Checks that whitespace inside {@code <pre>} is kept verbatim in the document text while
  * whitespace in the preceding paragraph is normalised.
  */
 @Test
 public void testKeepsPreText() {
   Document parsed =
       Jsoup.parse("<p>Hello \n \n there.</p> <div><pre>  What's \n\n  that?</pre>");
   assertEquals("Hello there.   What's \n\n  that?", parsed.text());
 }
Beispiel #12
0
 /**
  * Checks text extraction on the shared {@code reference} markup: once for the whole document
  * and once for its second {@code <p>} element.
  */
 @Test
 public void testGetText() {
   Document parsed = Jsoup.parse(reference);

   // Whole-document text.
   assertEquals("Hello Another element", parsed.text());
   // Text of the second paragraph only.
   assertEquals("Another element", parsed.getElementsByTag("p").get(1).text());
 }