/** Checks that a {@code <br>} between two words shows up as a single space in the text. */
@Test
public void testBrHasSpace() {
  // <br> directly between two words
  Document tight = Jsoup.parse("<p>Hello<br>there</p>");
  String tightText = tight.text();
  assertEquals("Hello there", tightText);
  String paragraphOwnText = tight.select("p").first().ownText();
  assertEquals("Hello there", paragraphOwnText);

  // <br> already surrounded by spaces
  Document padded = Jsoup.parse("<p>Hello <br> there</p>");
  String paddedText = padded.text();
  assertEquals("Hello there", paddedText);
}
/**
 * Executes a document made of two tables separated by a {@code ****} line and checks the
 * accumulated statistics: 2 right and 3 wrong.
 */
@Test
public void testInterpretsASequenceOfTables() {
  String ruleTable =
      "["
          + RuleForInterpreter.class.getName()
          + "]["
          + AlternateCalculator.class.getName()
          + "]\n"
          + "[a][b][sum?]\n"
          + "[6][2][8]\n"
          + "[5][2][8]\n";
  String tableSeparator = "****\n";
  String setTable =
      "["
          + SetOfInterpreter.class.getName()
          + "]["
          + RowFixtureTarget.class.getName()
          + "]\n"
          + "[a][b][c]\n"
          + "[1][2][3]";

  tables = parse(ruleTable + tableSeparator + setTable);

  Document document = Document.text(tables);
  execute(document);

  assertEquals(2, document.getStatistics().rightCount());
  assertEquals(3, document.getStatistics().wrongCount());
}
public List<String> extractCities(Document doc) { HashMap<String, String> cityMap = new HashMap<String, String>(); cityMap.put("Adana", "Adana"); cityMap.put("Konya", "Konya"); cityMap.put("Tekirda\u011f", "Tekirda\u011f"); // \u011f List<String> cityList = new ArrayList<String>(); Element ilanDetay = doc.select("div#divIlanDetay").first(); String patternJobTitle = ".*(\u015eehir/\u00dclke|City/Country|Location).*"; Pattern pattern = Pattern.compile(patternJobTitle); Matcher matcher = pattern.matcher(getPlainText(ilanDetay)); if (matcher.find()) { String cityLine = matcher.group(); String[] cityLineArr = cityLine.split(":"); if (cityLineArr.length > 1) { String cityCommaStr = cityLineArr[1].trim(); String[] cityArr = cityCommaStr.split(" ")[0].split(","); for (String city : cityArr) { cityList.add(trim(city)); } } } if (cityList.size() == 0) { Set<String> tokenSet = tokenize(doc.text()); for (String s : tokenSet) { if (cityMap.containsKey(s)) { cityList.add(trim(cityMap.get(s))); } } } return cityList; }
/** * performs the action, adding the specified Annotation. Returns the position of the end of the * Annotation. */ @Override public int perform(Document doc, PatternApplication patap) { Span span; HashMap bindings = patap.bestBindings; // System.out.println ("bindings (for new annotation): " + bindings); if (spanVariable == null) { span = new Span(patap.startPosition, patap.bestPosition); } else if (spanVariable.name.toString() == "0") { span = new Span(patap.startPosition, patap.startPosition); } else { Object value = bindings.get(spanVariable.name); if (value instanceof Span) { span = (Span) value; } else if (value instanceof Annotation) { span = ((Annotation) value).span(); } else { System.out.println("Value of " + spanVariable.toString() + " is not a span.or annotation"); return -1; } } if (Pat.trace) Console.println( "Annotating " + doc.text(span) + " as " + type + " " + features.substitute(bindings).toSGMLString()); hideAnnotations(doc, type, span); hideAnnotations(doc, "token", span); Annotation newAnnotation = new Annotation(type, span, features.substitute(bindings)); doc.addAnnotation(newAnnotation); if (bindingVariable != null) bindings.put(bindingVariable.name, newAnnotation); return span.end(); }
/** Checks that newlines inside {@code <pre><code>} survive in both text and HTML output. */
@Test
public void testKeepsPreTextInCode() {
  Document parsed = Jsoup.parse("<pre><code>code\n\ncode</code></pre>");

  String plainText = parsed.text();
  assertEquals("code\n\ncode", plainText);

  String bodyHtml = parsed.body().html();
  assertEquals("<pre><code>code\n\ncode</code></pre>", bodyHtml);
}
/** Checks that whitespace across paragraphs and inline tags is normalised in the text. */
@Test
public void testNormalisesText() {
  Document parsed =
      Jsoup.parse("<p>Hello<p>There.</p> \n <p>Here <b>is</b> \n s<b>om</b>e text.");
  assertEquals("Hello There. Here is some text.", parsed.text());
}
/** * Adds <B>constit</B> annotations to an existing Document <CODE>doc</CODE> to represent the parse * tree structure <CODE>tree</CODE>. * * @param tree the parse tree (for a portion of Document doc) * @param doc the document * @param span the portion of doc covered by the parse tree * @param jetCategories if true, use Jet categories as terminal categories (if false, use * categories read from parse trees) */ public void addAnnotations(ParseTreeNode tree, Document doc, Span span, boolean jetCategories) { List<ParseTreeNode> terminalNodes = getTerminalNodes(tree); String text = doc.text(); int offset = span.start(); for (ParseTreeNode terminal : terminalNodes) { while (offset < span.end() && Character.isWhitespace(text.charAt(offset))) { offset++; } for (String skipString : skip) { if (text.startsWith(skipString, offset)) { offset += skipString.length(); while (offset < span.end() && Character.isWhitespace(text.charAt(offset))) { offset++; } break; } } // match next terminal node against next word in text int matchLength = matchTextToTree(text, offset, terminal.word); if (matchLength > 0) { int endOffset = offset + matchLength; while (endOffset < span.end() && Character.isWhitespace(text.charAt(endOffset))) { endOffset++; } terminal.start = offset; terminal.end = endOffset; offset = endOffset; } else { System.err.println( "PTBReader.addAnnotations: " + "Cannot determine parse tree offset for word " + terminal.word); System.err.println(" at document offset " + offset + " in sentence"); System.err.println(" " + doc.text(span)); return; } } if (jetCategories) { setJetAnnotations(tree, span, doc); StatParser.deleteUnusedConstits(doc, span, tree.ann); // <<< } else { determineNonTerminalSpans(tree, span.start()); setAnnotations(tree, doc); } }
@Test public void testSetText() { String h = "<div id=1>Hello <p>there <b>now</b></p></div>"; Document doc = Jsoup.parse(h); assertEquals("Hello there now", doc.text()); // need to sort out node whitespace assertEquals("there now", doc.select("p").get(0).text()); Element div = doc.getElementById("1").text("Gone"); assertEquals("Gone", div.text()); assertEquals(0, doc.select("p").size()); }
/**
 * Converts a set of Penn TreeBank files into text documents. Invoked by: PTBReader inputDir
 * outputDir. Converts all files with extension .mrg in inputDir to text documents, and writes
 * them into outputDir.
 *
 * <p>Each output file is placed under outputDir at the input file's path relative to inputDir,
 * passed through {@code removeSuffix}; missing parent directories are created. When the
 * argument count is not exactly two, a usage line is printed and the JVM exits with status 1.
 *
 * @param args {@code args[0]} is the input directory, {@code args[1]} the output directory
 * @throws Exception if loading a tree file or writing an output file fails
 */
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    System.out.println("usage: java " + PTBReader.class.getName() + " <inputDir> <outputDir>");
    System.exit(1);
  }
  File inputDir = new File(args[0]);
  File outputDir = new File(args[1]);
  PTBReader parser = new PTBReader();
  for (File file : getFiles(inputDir, ".mrg")) {
    String outFilename = removeSuffix(getRelativePath(inputDir, file));
    File outFile = new File(outputDir, outFilename);
    outFile.getParentFile().mkdirs();
    Writer out = new FileWriter(outFile);
    try {
      Document doc = parser.load(file).getDocument();
      out.write(doc.text());
    } finally {
      // Close the writer even when loading or writing fails, so the file handle is not leaked.
      out.close();
    }
  }
}
/** Returns a new {@code Document} created by {@code Document.text} from the {@code tables} field. */
private Document document() {
  Document fromTables = Document.text(tables);
  return fromTables;
}
/** Checks that whitespace is normalised in {@code <p>} but kept as-is inside {@code <pre>}. */
@Test
public void testKeepsPreText() {
  Document parsed =
      Jsoup.parse("<p>Hello \n \n there.</p> <div><pre> What's \n\n that?</pre>");
  String extracted = parsed.text();
  assertEquals("Hello there. What's \n\n that?", extracted);
}
/** Checks the text of the whole reference document and of its second {@code <p>} element. */
@Test
public void testGetText() {
  Document parsed = Jsoup.parse(reference);

  String documentText = parsed.text();
  assertEquals("Hello Another element", documentText);

  String secondParagraphText = parsed.getElementsByTag("p").get(1).text();
  assertEquals("Another element", secondParagraphText);
}