/**
 * Looks up the {@code content} attribute of the first META start tag whose {@code name}
 * attribute matches the given key.
 *
 * <p>Start tags that carry a matching {@code name} attribute but are not META tags are skipped
 * and the search resumes from the end of that tag.
 *
 * @param source the document to search
 * @param key the {@code name} attribute value to look for
 * @return whatever {@code getAttributeValue("content")} yields for the first matching META tag,
 *     or {@code null} if no start tag with a matching {@code name} attribute is a META tag
 */
private static String getMetaValue(Source source, String key) {
   int pos = 0;
   while (pos < source.length()) {
     // NOTE(review): the trailing 'false' presumably requests case-insensitive matching of the
     // attribute value - confirm against the Jericho getNextStartTag documentation.
     StartTag startTag = source.getNextStartTag(pos, "name", key, false);
     if (startTag == null) {
       return null;
     }
     // Compare by value, not by reference: '==' on strings only works when both sides happen to
     // be the same interned instance.
     if (HTMLElementName.META.equals(startTag.getName())) {
       return startTag.getAttributeValue("content"); // Attribute values are automatically decoded
     }
     pos = startTag.getEnd();
   }
   return null;
 }
 /**
  * Writes the output for this object's segment to the given writer and flushes it.
  *
  * <p>The writer is stored in the {@code writer} field, the {@code nextTag} and {@code index}
  * cursors are reset to the beginning of the segment, and the actual output is delegated to
  * {@code writeContent}.
  *
  * @param writer the destination; it is flushed but not closed by this method
  * @throws IOException declared for failures while writing or flushing
  */
 public void writeTo(final Writer writer) throws IOException {
   this.writer = writer;
   // When the segment is the whole source document, parse it completely up front.
   if (segment instanceof Source) {
     final Source wholeDocument = (Source) segment;
     wholeDocument.fullSequentialParse();
   }
   // Reset both cursors to the start of the segment before writing.
   nextTag = segment.source.findNextTag(segment.begin);
   index = segment.begin;
   writeContent(segment.end, segment.getChildElements(), 0);
   writer.flush();
 }
// Example #3
// 0
 /**
  * Prints the title and the encoding details of an HTML document.
  *
  * <p>The document location is taken from the first command line argument; when no argument is
  * given, {@code data/test.html} is used and a notice is written to standard error. A location
  * without a colon is treated as a file path and prefixed with {@code file:}.
  *
  * @param args optional single element holding the URL or file path of the document
  * @throws Exception if the URL is malformed or the document cannot be loaded
  */
 public static void main(String[] args) throws Exception {
   String location;
   if (args.length != 0) {
     location = args[0];
   } else {
     location = "data/test.html";
     System.err.println("Using default argument of \"" + location + '"');
   }
   // No scheme separator present: treat the argument as a local file path.
   if (!location.contains(":")) {
     location = "file:" + location;
   }
   System.out.println("\nSource URL:");
   System.out.println(location);

   Source source = new Source(new URL(location));

   System.out.println("\nDocument Title:");
   Element title = source.getFirstElement(HTMLElementName.TITLE);
   String titleText;
   if (title == null) {
     titleText = "(none)";
   } else {
     titleText = title.getContent().toString();
   }
   System.out.println(titleText);

   System.out.println("\nSource.getEncoding():");
   System.out.println(source.getEncoding());
   System.out.println("\nSource.getEncodingSpecificationInfo():");
   System.out.println(source.getEncodingSpecificationInfo());
   System.out.println("\nSource.getPreliminaryEncodingInfo():");
   System.out.println(source.getPreliminaryEncodingInfo());
 }
 /**
  * Returns the text of the document's first TITLE element.
  *
  * @param source the document to inspect
  * @return the result of decoding the TITLE content with
  *     {@code CharacterReference.decodeCollapseWhiteSpace}, or {@code null} if the document has
  *     no TITLE element
  */
 private static String getTitle(Source source) {
   final Element title = source.getFirstElement(HTMLElementName.TITLE);
   // TITLE element never contains other tags, so decoding with collapsed whitespace is enough.
   return title == null
       ? null
       : CharacterReference.decodeCollapseWhiteSpace(title.getContent());
 }
  /**
   * Extracts the text of every entry in a fixed directory of web pages and writes each result to
   * a {@code .txt} file.
   *
   * <p>For each entry returned by {@code listFiles()} the document is loaded, fully parsed, and
   * its text (including attribute text, as requested via {@code setIncludeAttributes(true)}) is
   * written out. Nothing happens when the directory does not exist or cannot be listed.
   *
   * @param args ignored
   * @throws Exception if a document cannot be loaded or an output file cannot be created
   */
  public static void main(String[] args) throws Exception {
    final String directoryPath = "asd\\Web Pages\\HousingAndEnvironment";
    File directory = new File(directoryPath);

    // listFiles() returns null when the path is not a directory or an I/O error occurs, so this
    // single check also covers the "not a directory" case.
    File[] entries = directory.listFiles();
    if (entries == null) {
      return;
    }

    for (File f : entries) {
      // Let File build the URL: it resolves the path and escapes characters such as the space
      // in "Web Pages", instead of gluing "file:" onto a raw backslash-separated path.
      Source source = new Source(f.toURI().toURL());
      // Call fullSequentialParse manually as most of the source will be parsed.
      source.fullSequentialParse();
      String text = source.getTextExtractor().setIncludeAttributes(true).toString();

      // NOTE(review): there is no separator between the directory name and the file name, so the
      // output is created next to the directory as "HousingAndEnvironment<name>.txt" rather than
      // inside it. Kept as-is; confirm whether that is intended.
      // The output file is opened only after parsing succeeded, so a failed parse no longer
      // leaves an empty file behind.
      PrintWriter writer = new PrintWriter(directoryPath + f.getName() + ".txt");
      try {
        writer.print(text);
        writer.flush();
      } finally {
        // Always release the file handle, even if writing fails.
        writer.close();
      }
    }
  }