コード例 #1
0
  /**
   * Parses one search-result listing page and collects a basic entity for every result item.
   *
   * <p>For each {@code div[class*=map-list-item]} element the category text (industry and label),
   * the establishment link (same-as entry and name), the first details paragraph (telephone and
   * address) and the {@code data-lng}/{@code data-lat} attributes are read into a new
   * {@code PersistentEntity}, which is then appended to {@code pEntities}. Pieces that are missing
   * from the markup are skipped for that item instead of aborting the whole page.
   *
   * @param pUrl URL of the search-result page to fetch and parse
   */
  private void parseSearchResults(String pUrl) {
    LOGGER.info("Started parsing: " + pUrl);

    Document doc = ParserUtils.connectGetUrl(ParserUtils.getUri(pUrl).toASCIIString());
    if (doc == null) {
      // No document could be obtained, so there is nothing to parse.
      LOGGER.warn("Could not retrieve search results from: " + pUrl);
      return;
    }
    doc.setBaseUri(DEFAULT_VSP_URL);

    Elements results = doc.select("div[class*=map-list-item]");
    for (Element result : results) {
      PersistentEntity ent = new PersistentEntity();
      Elements infoElement = result.select("div[class*=info-content]");

      // first() yields null when the selector matched nothing, so guard before reading the text.
      Element category = infoElement.select("p[class*=establishment-category]").first();
      if (category != null) {
        String categoryText = category.ownText();
        LOGGER.debug(categoryText);
        // The industry is the part before the first '/'. indexOf is used instead of split("/")[0]
        // because split drops trailing empty strings and returns an empty array for "/".
        int slash = categoryText.indexOf('/');
        String industry = slash < 0 ? categoryText : categoryText.substring(0, slash);
        ent.setIndustry(new Utf8(industry));
        ent.setLabel(new Utf8(categoryText));
      }

      // The establishment link doubles as the same-as reference and the entity name.
      EylloLink link =
          ParserUtils.detectUrl(
              infoElement.select("p[class*=establishment-name]").select("a").first());
      if (link != null) {
        String entityUrl = DEFAULT_VSP_URL + link.getLinkHref();
        LOGGER.debug(entityUrl);
        ent.putToSameAs(new Utf8(entityUrl), new Utf8(link.getLinkText()));
        ent.setName(new Utf8(link.getLinkText()));
      }

      // Telephone and address are both taken from the first details paragraph: the telephone from
      // its own text only, the address from its full text including child elements.
      PersistentPoint point = new PersistentPoint();
      Elements details = result.select("div[class*=establishment-details]").select("p");
      if (!details.isEmpty()) {
        Element firstDetail = details.get(0);
        ent.addToTelephones(new Utf8(firstDetail.ownText()));
        point.setAddress(new Utf8(firstDetail.text()));
      }

      String lngAttr = result.attr("data-lng");
      String latAttr = result.attr("data-lat");
      if (!lngAttr.isEmpty() && !latAttr.isEmpty()) {
        try {
          // Parse both values before storing either, so a malformed latitude cannot leave a
          // half-filled coordinate list behind.
          double lng = Double.parseDouble(lngAttr);
          double lat = Double.parseDouble(latAttr);
          // Format in [lon, lat]; note the order of lon/lat here in order to conform with GeoJSON.
          point.addToCoordinates(lng);
          point.addToCoordinates(lat);
          point.setAccuracy(EylloLocation.GEOCODER_VERIF_ACC_HIGH);
        } catch (NumberFormatException e) {
          LOGGER.warn("Ignoring malformed coordinates [" + lngAttr + ", " + latAttr + "]");
        }
      }
      ent.setPersistentpoint(point);
      ent.addToScenarioId(getScenarioId());

      this.pEntities.add(ent);
    }
    LOGGER.info("Completed getting basic information from entities.");
  }
コード例 #2
0
  /**
   * Enriches an already collected entity with the details found on its own page(s).
   *
   * <p>Every URL that is a same-as key of the entity when this method is called is fetched. From
   * each page the working hours (schedule), price range and payment options (extra info), services
   * and observations, and the homepage link are extracted and stored on the entity. The
   * description is reset to an empty string after each processed page. Processing stops at the
   * first page that cannot be fetched or fails {@code validateSite}.
   *
   * @param pEntity entity to enrich; it is modified in place
   */
  public void parseIndividualEnt(PersistentEntity pEntity) {
    // Work on a snapshot of the keys: the homepage link found below is added to the same-as map,
    // and modifying a map while iterating over it may fail with ConcurrentModificationException.
    Utf8[] sameAsUrls = pEntity.getSameAs().keySet().toArray(new Utf8[0]);
    for (Utf8 sameAsUrl : sameAsUrls) {
      // Reading individual URLs
      String url = ParserUtils.getUri(sameAsUrl.toString()).toASCIIString();
      LOGGER.info("Parsing entity from: " + url);
      Document doc = ParserUtils.connectGetUrl(url);
      // Either condition alone makes the page unusable, hence '||' rather than '&&'.
      if (doc == null || !validateSite(doc)) {
        break;
      }
      doc.setBaseUri(VejaSaoPauloParser.DEFAULT_VSP_URL);

      // All sections of interest live under this container.
      Elements infoSections = doc.select("div[class*=information-unwanted]");
      StringBuilder strBuilder = new StringBuilder();

      // getting working hours
      Elements workElems = infoSections.select("div[class*=working-hours]");
      if (!workElems.isEmpty()) {
        for (Element hours : workElems.select("div[class*=hours]").select("p")) {
          strBuilder.append(hours.text().replace("-", "_")).append(ParserProperties.INFO_SEP);
        }
        pEntity.setSchedule(new Utf8(strBuilder.toString()));
      }

      // getting price range
      workElems = infoSections.select("div[class*=price]").select("p[class*=price-range]");
      strBuilder.setLength(0);
      if (!workElems.isEmpty()) {
        // first() yields null when the section has no heading; the prefix is then left out.
        Element priceHeading = doc.select("div[class*=price]").select("h3").first();
        if (priceHeading != null) {
          strBuilder.append(priceHeading.text()).append(ParserProperties.DESC_SEP);
        }
        strBuilder.append(workElems.text());
        pEntity.addToExtraInfo(new Utf8(strBuilder.toString()));
      }

      // getting payment information
      workElems = infoSections.select("div[class*=payment]").select("p");
      strBuilder.setLength(0);
      if (!workElems.isEmpty()) {
        Element paymentHeading = doc.select("div[class*=payment]").select("h3").first();
        if (paymentHeading != null) {
          strBuilder.append(paymentHeading.text()).append(ParserProperties.DESC_SEP);
        }
        int headingLength = strBuilder.length();
        for (Element paymentElem : workElems) {
          String payment = paymentElem.text().trim();
          if (!payment.equals("")) {
            strBuilder.append(payment).append(ParserProperties.INFO_SEP);
          }
        }
        // Store the collected text, mirroring the price section; previously it was built and then
        // discarded. Nothing is stored when no payment entry had any text.
        if (strBuilder.length() > headingLength) {
          pEntity.addToExtraInfo(new Utf8(strBuilder.toString()));
        }
      } // END-IF_PAYMENT

      // getting services provided information
      workElems =
          infoSections
              .select("div[class*=services]")
              .select("div[class*=information-services]")
              .select("p");
      for (Element serviceElem : workElems) {
        String service = serviceElem.text();
        if (serviceElem.hasClass("observation")) {
          pEntity.addToExtraInfo(new Utf8("Observation :" + service));
        } else if (!service.equals("")) {
          pEntity.addToServices(new Utf8(service));
        }
      } // END-FOR_SERVICES

      // getting home url
      workElems = infoSections.select("div[class*=website]");
      if (!workElems.isEmpty()) {
        EylloLink homeLink =
            ParserUtils.detectUrl(
                workElems
                    .select("div[class*=information-website]")
                    .select("p")
                    .select("a")
                    .first());
        if (homeLink != null) {
          pEntity.setHomepage(new Utf8(homeLink.getLinkHref()));
          pEntity.putToSameAs(new Utf8(homeLink.getLinkHref()), new Utf8(homeLink.getLinkText()));
        }
      } // END-IF_URL
      pEntity.setDescription(new Utf8(""));
    } // END-FOR_SAME_AS
  }