Esempi in Java per Source.fullSequentialParse

Linguaggio di programmazione: Java

Spazio dei nomi/nome del pacchetto: net.htmlparser.jericho

Classe/tipologia: Source

Metodo/funzione: fullSequentialParse

Esempi su hotexamples.com: 5

Source.fullSequentialParse in Java: 5 esempi trovati. Questi sono i migliori esempi reali in Java di net.htmlparser.jericho.Source.fullSequentialParse, estratti da progetti open source. Puoi valutarli per aiutarci a migliorare la qualità dei nostri esempi.

Metodi utilizzati di frequente

Mostra / Nascondi

getAllElements(9)

getTextExtractor(6)

fullSequentialParse(5)

getAllElementsByClass(4)

getFirstElement(4)

getAllStartTags(3)

getRenderer(2)

getElementById(2)

setLogger(2)

length(1)

isXML(1)

getNextElement(1)

getPreliminaryEncodingInfo(1)

getNextStartTag(1)

getFormFields(1)

getEncodingSpecificationInfo(1)

getEncoding(1)

toString(1)

Esempio n. 1

Mostra file

File: DefaultFieldExtractor.java Progetto: doaaanwar/scraper

 /**
  * Builds a list of fields from the TD/TR/TH elements found in the given HTML.
  *
  * <p>When the number of TD elements is exactly twice the number of TR elements, the cells are
  * read as alternating label/value pairs: a trailing colon is stripped from the label, and a pair
  * whose label is empty is added as an extra value to the previously created field (if any).
  *
  * <p>Otherwise each TR is handled on its own: the text of its TH elements becomes the header
  * list, missing headers are filled in as {@code "col" + index}, and every TD of the row yields
  * one field labelled with the header at the same position. The header list is emptied after
  * each row that contained TD elements.
  *
  * @param html the HTML to parse
  * @return the fields extracted, in document order
  */
 private List<Field> extractFieldsFromTable(String html) {
   List<Field> result = new ArrayList<Field>();
   Source document = new Source(html);
   document.fullSequentialParse();

   int cellCount = document.getAllElements(HTMLElementName.TD).size();
   int rowCount = document.getAllElements(HTMLElementName.TR).size();
   log.debug("found {} cells in {} rows", cellCount, rowCount);

   if (cellCount == (rowCount * 2)) {
     // Label/value layout: cells come in pairs, so walk them two at a time.
     Field previous = null;
     log.debug("cells.size: {}", cellCount);
     List<Element> allCells = document.getAllElements(HTMLElementName.TD);
     for (int pos = 0; pos < cellCount; pos += 2) {
       Element labelCell = allCells.get(pos);
       Element valueCell = allCells.get(pos + 1);
       String label = labelCell.getTextExtractor().toString().trim().replaceAll(":$", "");
       String value = getValueFieldText(valueCell);
       log.debug("found field: {}={}", label, value);
       if (previous == null || !StringUtils.isEmpty(label)) {
         previous = new ScrapedField(label, value);
         result.add(previous);
       } else {
         // An unlabeled pair continues the field that precedes it.
         previous.addValue(value);
       }
     }
     return result;
   }

   // Row-by-row layout.
   List<String> columnNames = new ArrayList<String>();
   for (Element row : document.getAllElements(HTMLElementName.TR)) {
     List<Element> headerCells = row.getAllElements(HTMLElementName.TH);
     if (headerCells.size() > 0) {
       columnNames.clear();
     }
     for (Element headerCell : headerCells) {
       String headerText = headerCell.getTextExtractor().toString();
       columnNames.add(headerText);
       log.debug("header text: {}", headerText);
     }

     List<Element> dataCells = row.getAllElements(HTMLElementName.TD);
     if (dataCells.size() == 0) {
       continue;
     }
     // Pad with synthetic names so every data cell has a label.
     for (int missing = columnNames.size(); missing < dataCells.size(); missing++) {
       columnNames.add("col" + missing);
     }
     for (int column = 0; column < dataCells.size(); column++) {
       String label = columnNames.get(column);
       String value = getValueFieldText(dataCells.get(column));
       result.add(new ScrapedField(label, value));
     }
     columnNames.clear();
   }
   return result;
 }

Esempio n. 2

Mostra file

File: DefaultFieldExtractor.java Progetto: doaaanwar/scraper

 /**
  * Builds a list of fields from the DT and DD elements found in the given HTML.
  *
  * <p>The n-th DT supplies the label (trimmed, trailing colon removed) and the n-th DD supplies
  * the value. Pairing stops at the shorter of the two element lists.
  *
  * @param html the HTML to parse
  * @return one field per DT/DD pair, in document order
  */
 private List<Field> extractFieldsFromDL(String html) {
   Source document = new Source(html);
   document.fullSequentialParse();

   List<Element> terms = document.getAllElements(HTMLElementName.DT);
   List<Element> definitions = document.getAllElements(HTMLElementName.DD);
   int pairCount = Math.min(terms.size(), definitions.size());

   List<Field> result = new ArrayList<Field>();
   for (int pair = 0; pair < pairCount; pair++) {
     String rawLabel = terms.get(pair).getTextExtractor().toString();
     String label = rawLabel.trim().replaceAll(":$", "");

     Element definition = definitions.get(pair);
     log.debug("looking at value element: {}", definition);
     String value = getValueFieldText(definition);

     result.add(new ScrapedField(label, value));
   }
   return result;
 }

Esempio n. 3

Mostra file

File: DefaultFieldExtractor.java Progetto: doaaanwar/scraper

 /**
  * Builds a list of fields from the LI elements found in the given HTML.
  *
  * <p>For each LI, the extracted text is split on {@code ":"}. If that yields exactly two parts
  * they become the label and value of a new field. Otherwise, if the LI has exactly four tags
  * once BR tags are filtered out by {@code tagsWithSpecificTagRemoved}, the rendered text of the
  * element belonging to the LI's second tag is taken as the label; when the rendered text of
  * the whole LI starts with that label, whatever follows it (skipping one character) is the
  * value. LI elements matching neither form are ignored.
  *
  * @param html the HTML to parse
  * @return the fields extracted, in document order
  */
 private List<Field> extractFieldsFromUL(String html) {
   Source document = new Source(html);
   document.fullSequentialParse();

   List<Field> result = new ArrayList<Field>();
   for (Element item : document.getAllElements(HTMLElementName.LI)) {
     log.debug("looking at li: {} w/text: {}", item, item.getTextExtractor().toString());

     String[] pieces = item.getTextExtractor().toString().split(":");
     if (pieces.length == 2) {
       Field simpleField = new ScrapedField(pieces[0], pieces[1]);
       result.add(simpleField);
       log.debug("found <li> to process: {}, added field: {}", item, simpleField);
       continue;
     }

     if (tagsWithSpecificTagRemoved(HTMLElementName.BR, item.getAllTags()).size() != 4) {
       continue;
     }

     Tag labelTag = item.getAllTags().get(1);
     log.debug("enclosing tag: {}", labelTag);
     log.debug(
         "first element of enclosing tag: {}",
         labelTag.getElement().getTextExtractor().toString());

     // Render without line wrapping so the label can be matched as a prefix of the full text.
     String renderedLabel =
         labelTag.getElement().getRenderer().setMaxLineLength(Integer.MAX_VALUE).toString();
     String tagText = renderedLabel.trim().replaceAll(":$", "");
     String renderedItem = item.getRenderer().setMaxLineLength(Integer.MAX_VALUE).toString();
     String allText = renderedItem.trim().replaceAll(":$", "");

     log.debug("enclosing tag text starts at: {}", allText.indexOf(tagText));
     log.debug(
         "tagText (length = {}): {} alltext (length = {}): {}",
         new Object[] {tagText.length(), tagText, allText.length(), allText});

     if (!allText.startsWith(tagText)) {
       continue;
     }
     String valueText;
     if (allText.length() > tagText.length()) {
       // Skip the single character that follows the label text.
       valueText = allText.substring(tagText.length() + 1);
     } else {
       valueText = "";
     }
     result.add(new ScrapedField(tagText, valueText));
     log.debug("extracted fields = {}", result);
   }
   return result;
 }

Esempio n. 4

Mostra file

File: ExtractText.java Progetto: karimEssawi/TwitterMining

  /**
   * Extracts the text of every entry in the hard-coded input directory and writes it to a
   * {@code .txt} file, one output file per input entry.
   *
   * <p>Does nothing when the input path is not a directory or cannot be listed. Each entry is
   * parsed before its output file is opened, so a parse failure does not leave an empty output
   * file behind, and the writer is always closed even when writing fails.
   *
   * @param args ignored
   * @throws Exception if an entry cannot be read or parsed, or an output file cannot be created
   */
  public static void main(String[] args) throws Exception {
    File inputDir = new File("asd\\Web Pages\\HousingAndEnvironment");

    // listFiles() returns null when the path is not a directory or an I/O error occurs.
    File[] pages = inputDir.listFiles();
    if (pages == null) {
      return;
    }

    for (File page : pages) {
      // toURI().toURL() escapes characters (spaces, '%', ...) that a hand-built "file:" URL
      // would not.
      Source source = new Source(page.toURI().toURL());
      // Call fullSequentialParse manually as most of the source will be parsed.
      source.fullSequentialParse();
      String text = source.getTextExtractor().setIncludeAttributes(true).toString();

      // NOTE(review): there is no separator between the directory name and the file name, so
      // the output is written next to the input directory as "HousingAndEnvironment<name>.txt".
      // Kept as in the original; confirm whether this is intended.
      PrintWriter writer =
          new PrintWriter("asd\\Web Pages\\HousingAndEnvironment" + page.getName() + ".txt");
      try {
        writer.print(text);
        writer.flush();
      } finally {
        writer.close();
      }
    }
  }

Esempio n. 5

Mostra file

File: DefaultFieldExtractor.java Progetto: doaaanwar/scraper

 /**
  * Tells whether the given field value, parsed as HTML, contains more than one BR element.
  *
  * @param fieldValue the field value to inspect
  * @return {@code true} if two or more BR elements are found, {@code false} otherwise
  */
 private boolean fieldHasMultipleValues(String fieldValue) {
   Source parsedValue = new Source(fieldValue);
   parsedValue.fullSequentialParse();
   int lineBreakCount = parsedValue.getAllElements(HTMLElementName.BR).size();
   return lineBreakCount >= 2;
 }