/** * Parse search results from a search result site * * @param pUrl */ private void parseSearchResults(String pUrl) { LOGGER.info("Started parsing: " + pUrl); Document doc = null; doc = ParserUtils.connectGetUrl(ParserUtils.getUri(pUrl).toASCIIString()); doc.setBaseUri(DEFAULT_VSP_URL); Elements results = doc.select("div[class*=map-list-item]"); for (Element result : results) { PersistentEntity ent = new PersistentEntity(); Elements infoElement = result.select("div[class*=info-content]"); LOGGER.debug(infoElement.select("p[class*=establishment-category]").first().ownText()); String tmp = result .select("div[class*=info-content]") .select("p[class*=establishment-category]") .first() .ownText(); ent.setIndustry(new Utf8(tmp.split("/")[0])); ent.setLabel(new Utf8(tmp)); // getting same as value to where it is EylloLink link = ParserUtils.detectUrl( infoElement.select("p[class*=establishment-name]").select("a").first()); if (link != null) { LOGGER.debug(DEFAULT_VSP_URL + link.getLinkHref()); ent.putToSameAs( new Utf8(DEFAULT_VSP_URL + link.getLinkHref()), new Utf8(link.getLinkText())); ent.setName(new Utf8(link.getLinkText())); } // getting its address and phone PersistentPoint point = new PersistentPoint(); infoElement = result.select("div[class*=establishment-details]").select("p"); ent.addToTelephones(new Utf8(infoElement.get(0).ownText())); point.setAddress(new Utf8(infoElement.get(0).text())); if (!result.attr("data-lng").toString().equals("") && !result.attr("data-lat").toString().equals("")) { // Format in [lon, lat], note, the order of lon/lat here in order to conform with GeoJSON. point.addToCoordinates(Double.parseDouble(result.attr("data-lng"))); point.addToCoordinates(Double.parseDouble(result.attr("data-lat"))); point.setAccuracy(EylloLocation.GEOCODER_VERIF_ACC_HIGH); } ent.setPersistentpoint(point); ent.addToScenarioId(getScenarioId()); this.pEntities.add(ent); } LOGGER.info("Completed getting basic information from entities."); }
/**
 * Enriches an existing entity with details scraped from the pages it is linked to.
 *
 * <p>Each key of the entity's "same as" map is fetched as a URL. From a valid page the method
 * reads working hours (stored as schedule), price range and payment information (stored as
 * extra info), services and observations, and the homepage link (stored as homepage and as an
 * additional "same as" entry). The description is reset to an empty string for every page
 * processed. Processing stops at the first page that cannot be fetched or fails
 * {@code validateSite}.
 *
 * @param pEntity entity to enrich; it is modified in place
 */
public void parseIndividualEnt(PersistentEntity pEntity) {
  if (pEntity.getSameAs() == null) {
    return;
  }
  // Iterate over a snapshot of the keys: putToSameAs below adds the homepage to the same map,
  // which would otherwise invalidate a live iterator (or make the loop visit the homepage).
  Utf8[] sameAsUrls = pEntity.getSameAs().keySet().toArray(new Utf8[0]);

  for (Utf8 sameAsUrl : sameAsUrls) {
    // Reading individual URLs
    String url = ParserUtils.getUri(sameAsUrl.toString()).toASCIIString();
    LOGGER.info("Parsing entity from: " + url);
    Document doc = ParserUtils.connectGetUrl(url);
    // Either condition alone makes the page unusable (the original '&&' let invalid pages
    // through and could dereference a null document).
    if (doc == null || !validateSite(doc)) {
      break;
    }
    doc.setBaseUri(VejaSaoPauloParser.DEFAULT_VSP_URL);

    // getting working hours
    Elements workElems =
        doc.select("div[class*=information-unwanted]").select("div[class*=working-hours]");
    if (!workElems.isEmpty()) {
      StringBuilder schedule = new StringBuilder();
      for (Element info : workElems.select("div[class*=hours]").select("p")) {
        schedule.append(info.text().replace("-", "_")).append(ParserProperties.INFO_SEP);
      }
      pEntity.setSchedule(new Utf8(schedule.toString()));
    }

    // getting price range
    workElems =
        doc.select("div[class*=information-unwanted]")
            .select("div[class*=price]")
            .select("p[class*=price-range]");
    if (!workElems.isEmpty()) {
      Element priceHeading = doc.select("div[class*=price]").select("h3").first();
      String priceTitle = priceHeading == null ? "" : priceHeading.text();
      StringBuilder price = new StringBuilder();
      price.append(priceTitle + ParserProperties.DESC_SEP);
      price.append(workElems.text());
      pEntity.addToExtraInfo(new Utf8(price.toString()));
    }

    // getting payment information
    workElems =
        doc.select("div[class*=information-unwanted]")
            .select("div[class*=payment]")
            .select("p");
    if (!workElems.isEmpty()) {
      Element paymentHeading = doc.select("div[class*=payment]").select("h3").first();
      String paymentTitle = paymentHeading == null ? "" : paymentHeading.text();
      StringBuilder payment = new StringBuilder();
      payment.append(paymentTitle + ParserProperties.DESC_SEP);
      for (Element infoElem : workElems) {
        String paymentText = infoElem.text().trim();
        if (!paymentText.equals("")) {
          payment.append(paymentText + ParserProperties.INFO_SEP);
        }
      }
      // The original assembled this text and then discarded it; store it like the price range.
      pEntity.addToExtraInfo(new Utf8(payment.toString()));
    } // END-IF_PAYMENT

    // getting services provided information
    workElems =
        doc.select("div[class*=information-unwanted]")
            .select("div[class*=services]")
            .select("div[class*=information-services]")
            .select("p");
    for (Element infoElem : workElems) {
      if (infoElem.hasClass("observation")) {
        pEntity.addToExtraInfo(new Utf8("Observation :" + infoElem.text()));
      } else if (!infoElem.text().equals("")) {
        pEntity.addToServices(new Utf8(infoElem.text()));
      }
    } // END-FOR_SERVICES

    // getting home url
    workElems = doc.select("div[class*=information-unwanted]").select("div[class*=website]");
    if (!workElems.isEmpty()) {
      EylloLink homeLink =
          ParserUtils.detectUrl(
              workElems
                  .select("div[class*=information-website]")
                  .select("p")
                  .select("a")
                  .first());
      if (homeLink != null) {
        pEntity.setHomepage(new Utf8(homeLink.getLinkHref()));
        pEntity.putToSameAs(new Utf8(homeLink.getLinkHref()), new Utf8(homeLink.getLinkText()));
      }
    } // END-IF_URL

    pEntity.setDescription(new Utf8(""));
  } // END-FOR_SAME_AS
}