public LocalezeParser(LocalezeParserUtils Lparserutils) {
  parserutils = Lparserutils;
  mappedExistingIDs = parserutils.getIDMap();
  existingWhereIDs = mappedExistingIDs.getValues();

  // Find the largest existing whereId so that new IDs continue after it.
  for (int i = 0; i < existingWhereIDs.length; i++) {
    if (existingWhereIDs[i] > startId_) {
      startId_ = existingWhereIDs[i];
    }
  }
  System.out.println("Max of existing whereids is: " + startId_);
}
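// Illustrative sketch, not part of the parser: the scan above is equivalent to
// taking the max over the existing whereId values. A hypothetical stream-based
// stand-in:
private static long maxExistingWhereId(long[] existingWhereIDs) {
  return java.util.stream.LongStream.of(existingWhereIDs).max().orElse(0L);
}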
/**
 * Convert a Localeze record to a Place object.
 *
 * @param record pipe-delimited Localeze record
 * @return the parsed place
 */
public Place toPlace(String record) {
  // The -1 limit keeps trailing empty fields, which plain split() would drop.
  String[] bits = record.split("\\|", -1);
  if (bits.length < 47) {
    throw new RuntimeException("bits.length (" + bits.length + ") < 47");
  }

  // New place
  Place place = new Place();
  place.setSource(Place.Source.LOCALEZE);
  place.setNativeId(bits[0].trim());
  place.setShortname(bits[3]);
  place.setName(bits[4]);

  // Calculate whereId: reuse the mapped ID for a known record, otherwise
  // allocate the next ID past the current maximum.
  String whereIDtoWrite;
  long pid = Long.parseLong(place.getNativeId());
  if (mappedExistingIDs.containsKey(pid)) {
    whereIDtoWrite = Long.toString(mappedExistingIDs.get(pid));
  } else {
    whereIDtoWrite = Long.toString(++startId_);
  }
  place.setWhereId(whereIDtoWrite);

  // Address: fields 6-9 form the first street line, 10-12 the second.
  Address addr = new Address();
  StringBuilder street1 = new StringBuilder();
  for (int i = 6; i < 10; i++) {
    street1.append(bits[i]).append(' ');
  }
  addr.setAddress1(street1.toString().replaceAll("\\s+", " ").trim());

  StringBuilder street2 = new StringBuilder();
  for (int i = 10; i < 13; i++) {
    street2.append(bits[i]).append(' ');
  }
  addr.setAddress2(street2.toString().replaceAll("\\s+", " ").trim());

  addr.setNeighborhood(bits[13].trim());
  addr.setCity(bits[14].trim());
  addr.setState(bits[15].trim());

  // Append the ZIP+4 extension only when it is present.
  boolean zip4Missing = bits[17].trim().isEmpty();
  addr.setZip(zip4Missing ? bits[16].trim() : bits[16].trim() + "-" + bits[17].trim());
  place.setAddress(addr);

  // Geo
  double lat = Double.parseDouble(bits[45].trim());
  double lng = Double.parseDouble(bits[46].trim());
  place.setLatlng(new double[] {lat, lng});
  place.setGeohash(GeoHashUtils.encode(lat, lng));

  // Phone: area code, exchange, and line number are stored separately.
  place.setPhone(bits[34].trim() + bits[35].trim() + bits[36].trim());

  return place;
}
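// Illustrative usage sketch with hypothetical field values; the real Localeze
// feed has 47+ pipe-delimited columns, and only the indexes read above matter here.
private Place toPlaceExample() {
  String[] fields = new String[47];
  java.util.Arrays.fill(fields, "");
  fields[0] = "123456";                                        // native Localeze id
  fields[3] = "Joes";                                          // short name
  fields[4] = "Joe's Diner";                                   // name
  fields[6] = "12"; fields[7] = "Main"; fields[8] = "St";      // street line 1 parts
  fields[13] = "Downtown";                                     // neighborhood
  fields[14] = "Springfield"; fields[15] = "IL";               // city, state
  fields[16] = "62701";                                        // zip (fields[17] is the +4)
  fields[34] = "217"; fields[35] = "555"; fields[36] = "0100"; // phone parts
  fields[45] = "39.8017"; fields[46] = "-89.6437";             // lat, lng
  return toPlace(String.join("|", fields));
}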
@Override
public Object preProcess(DocumentSet training, Object transientData) {
  inverseIndex = new HashMap<>();
  double[] expectedWeightPerLabelHint = new double[2];
  System.out.println("Preparing clustering");

  // Class counts: index 1 holds documents carrying className, index 0 the rest.
  training.forEach(
      (docId, doc) -> {
        int cls = doc.getClasses().contains(className) ? 1 : 0;
        expectedWeightPerLabelHint[cls] += 1;
      });

  // Estimate the information gain of a single feature against the class label.
  Function<FeatureNode, Double> estimateInfoGain =
      (f) -> {
        InfoGainCalc calc =
            new InfoGainCalc(2, false, f.featureName, InfoGainCalc.EstimationEnum.INFO_GAIN);
        f.values.forEachEntry(
            (id, v) -> {
              calc.addSample(
                  training.document(id).getClasses().contains(className) ? 1 : 0,
                  v / InfoGainCalc.PRECISION_INV);
              return true;
            });
        calc.setExpectedWeightPerLabelHint(expectedWeightPerLabelHint);
        return calc.estimateRelevance();
      };

  Function<FeatureNode, Double> evaluator = (f) -> f.eval;

  // Merge two feature nodes, taking the per-document max or sum, then
  // re-evaluate the merged node's information gain.
  BiFunction<FeatureNode, FeatureNode, FeatureNode> combiner =
      (f1, f2) -> {
        FeatureNode combined = new FeatureNode(null);
        f1.values.forEachEntry(
            (id, v) -> {
              combined.values.put(id, v);
              return true;
            });
        f2.values.forEachEntry(
            (id, v) -> {
              // Trove returns 0 for a missing key, so documents absent from f1
              // contribute nothing to either combiner.
              long o = combined.values.get(id);
              combined.values.put(id, isMaxCombiner ? Math.max(v, o) : o + v);
              return true;
            });
        combined.eval = estimateInfoGain.apply(combined);
        return combined;
      };

  // Build the feature -> (docId -> scaled value) map.
  Map<String, TLongLongHashMap> fmap = new HashMap<>();
  training.forEach(
      (docId, doc) -> {
        doc.getFeatureSet(sourceFeatureSet)
            .forEach(
                (f) -> {
                  TLongLongHashMap fmapt = fmap.get(f.getName());
                  if (fmapt == null) {
                    fmapt = new TLongLongHashMap();
                    fmap.put(f.getName(), fmapt);
                  }
                  // Store values as fixed-point longs.
                  fmapt.put(docId, (long) (f.doubleValue() * InfoGainCalc.PRECISION_INV));
                });
      });

  // Convert to FeatureNodes, scoring each one.
  ArrayList<FeatureNode> arrTemp = new ArrayList<>();
  fmap.forEach(
      (fname, arr) -> {
        FeatureNode node = new FeatureNode(fname, arr);
        node.eval = estimateInfoGain.apply(node);
        arrTemp.add(node);
      });

  // Optimization: keep only the nOfBestToUse features with the highest info gain.
  arrTemp.sort((n1, n2) -> -Double.compare(n1.eval, n2.eval));
  for (int i = 0; i < arrTemp.size() && i < nOfBestToUse; i++) {
    inverseIndex.put(arrTemp.get(i).featureName, arrTemp.get(i));
  }

  System.out.println("Doing clustering");
  AgglomerativeSampling<FeatureNode> clustering =
      new AgglomerativeSampling<>(evaluator, combiner, inverseIndex.values());
  clustering.setMaxSamples(nOfBestToUse);
  clustering.doClustering(nClusters);

  // Collect the resulting clusters as sets of feature names.
  clusters = new HashMap<>();
  clustering.forEachCluster(
      (c) -> {
        Set<String> features = new HashSet<>();
        c.forEachLeaf((l) -> features.add(l.getPoint().featureName));
        if (!features.isEmpty()) {
          clusters.put(clusters.size(), features);
        }
      });

  // Release memory.
  inverseIndex = null;
  return null;
}
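// Illustrative sketch of the combiner semantics above, detached from FeatureNode.
// It assumes GNU Trove's TLongLongHashMap, whose get() returns 0 for a missing
// key by default; "combineSketch" is a hypothetical helper, not project code.
private static TLongLongHashMap combineSketch(TLongLongHashMap a, TLongLongHashMap b, boolean max) {
  TLongLongHashMap out = new TLongLongHashMap();
  a.forEachEntry((id, v) -> { out.put(id, v); return true; });
  b.forEachEntry((id, v) -> {
    long o = out.get(id);                      // 0 when the doc is absent from a
    out.put(id, max ? Math.max(v, o) : o + v); // per-document max or sum
    return true;
  });
  // e.g. a = {1:5}, b = {1:3, 2:4}  ->  max: {1:5, 2:4}, sum: {1:8, 2:4}
  return out;
}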