Example #1
  public LocalezeParser(LocalezeParserUtils Lparserutils) {
    parserutils = Lparserutils;
    mappedExistingIDs = parserutils.getIDMap();
    existingWhereIDs = mappedExistingIDs.getValues();

    // find whereid max
    for (int i = 0; i < existingWhereIDs.length; i++) {
      if (existingWhereIDs[i] > startId_) startId_ = existingWhereIDs[i];
    }
    System.out.println("Max of existing whereids is: " + startId_);
  }
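
  // The fields referenced above are declared elsewhere in the class; the sketch below
  // is an assumption inferred from how they are used here (getValues(), containsKey(long)
  // and get(long) suggest a primitive long-to-long map such as Trove's TLongLongHashMap).
  private LocalezeParserUtils parserutils;
  private TLongLongHashMap mappedExistingIDs; // native place id -> whereid (assumed type)
  private long[] existingWhereIDs;
  private long startId_ = 0; // running max whereid; incremented when a new place is seen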
Example #2
  /**
   * Convert a localeze record to a place object
   *
   * @param record - input localeze record
   * @return place
   */
  public Place toPlace(String record) {
    // String.split never returns null, so only the field count needs to be validated
    String[] bits = record.split("\\|");
    if (bits.length < 47) {
      throw new RuntimeException("bits.length (" + bits.length + ") < 47");
    }
    // new place
    Place place = new Place();
    place.setSource(Place.Source.LOCALEZE);
    place.setNativeId(bits[0].trim());
    place.setShortname(bits[3]);
    place.setName(bits[4]);

    // calc whereid
    String whereIDtoWrite;
    long pid = Long.parseLong(place.getNativeId());

    if (mappedExistingIDs.containsKey(pid)) {
      // this pid already exists, pull its whereid from the existing ID map
      whereIDtoWrite = Long.toString(mappedExistingIDs.get(pid));
    } else {
      // increment the running max and use it as the new whereid
      whereIDtoWrite = Long.toString(++startId_);
    }

    place.setWhereId(whereIDtoWrite);

    // address
    Address addr = new Address();
    StringBuilder street1 = new StringBuilder();
    for (int i = 6; i < 10; i++) {
      street1.append(bits[i]).append(' ');
    }
    addr.setAddress1(street1.toString().replaceAll("\\s+", " ").trim());

    StringBuilder street2 = new StringBuilder();
    for (int i = 10; i < 13; i++) {
      street2.append(bits[i]).append(' ');
    }
    addr.setAddress2(street2.toString().replaceAll("\\s+", " ").trim());
    addr.setNeighborhood(bits[13].trim());
    addr.setCity(bits[14].trim());
    addr.setState(bits[15].trim());

    // append the ZIP+4 suffix only when it is present (the null check must come before isEmpty)
    boolean zip4Blank = (bits[17] == null) || bits[17].isEmpty();
    addr.setZip(zip4Blank ? bits[16].trim() : bits[16].trim() + "-" + bits[17].trim());

    place.setAddress(addr);

    // geo
    double lat = Double.parseDouble(bits[45].trim());
    double lng = Double.parseDouble(bits[46].trim());
    place.setLatlng(new double[] {lat, lng});
    String geohash = GeoHashUtils.encode(lat, lng);
    place.setGeohash(geohash);

    // phone
    place.setPhone(bits[34].trim() + bits[35].trim() + bits[36].trim());

    return place;
  }
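
  // Hypothetical usage sketch (not in the original source): builds a minimal 47-field
  // pipe-delimited record and runs it through toPlace. Getter names are assumed to
  // mirror the setters used above, and parserUtils is an existing LocalezeParserUtils.
  public static void demoToPlace(LocalezeParserUtils parserUtils) {
    String[] fields = new String[47];
    java.util.Arrays.fill(fields, "");
    fields[0] = "12345";        // native id (must parse as a long)
    fields[4] = "Joe's Diner";  // name
    fields[45] = "40.7128";     // latitude
    fields[46] = "-74.0060";    // longitude
    String record = String.join("|", fields);

    LocalezeParser parser = new LocalezeParser(parserUtils);
    Place place = parser.toPlace(record);
    System.out.println(place.getName() + " -> " + place.getWhereId());
  }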
  @Override
  public Object preProcess(DocumentSet training, Object transientData) {
    inverseIndex = new HashMap<>();
    double[] expectedWeightPerLabelHint = new double[2];
    System.out.println("Preparing clustering");

    // class counts (index 0: documents without className, index 1: documents with it)
    training.forEach(
        (docId, doc) -> {
          int cls = doc.getClasses().contains(className) ? 1 : 0;
          expectedWeightPerLabelHint[cls] += 1;
        });

    Function<FeatureNode, Double> estimateInfoGain =
        (f) -> {
          InfoGainCalc calc =
              new InfoGainCalc(2, false, f.featureName, InfoGainCalc.EstimationEnum.INFO_GAIN);
          f.values.forEachEntry(
              (id, v) -> {
                calc.addSample(
                    training.document(id).getClasses().contains(className) ? 1 : 0,
                    v / InfoGainCalc.PRECISION_INV);
                return true;
              });
          calc.setExpectedWeightPerLabelHint(expectedWeightPerLabelHint);
          return calc.estimateRelevance();
        };

    Function<FeatureNode, Double> evaluator = (f) -> f.eval;

    BiFunction<FeatureNode, FeatureNode, FeatureNode> combiner =
        (f1, f2) -> {
          FeatureNode combined = new FeatureNode(null);

          f1.values.forEachEntry(
              (id, v) -> {
                combined.values.put(id, v);
                return true;
              });
          f2.values.forEachEntry(
              (id, v) -> {
                // ids present only in f2 get the map's no-entry value (0 by default) here,
                // so both max and sum fall back to v for those ids
                long o = combined.values.get(id);
                if (isMaxCombiner) {
                  combined.values.put(id, Math.max(o, v));
                } else { // sum combiner
                  combined.values.put(id, o + v);
                }
                return true;
              });

          combined.eval = estimateInfoGain.apply(combined);
          return combined;
        };

    // calculate feature-doc map
    Map<String, TLongLongHashMap> fmap = new HashMap<>();
    training.forEach(
        (docId, doc) -> {
          doc.getFeatureSet(sourceFeatureSet)
              .forEach(
                  (f) -> {
                    TLongLongHashMap fmapt =
                        fmap.computeIfAbsent(f.getName(), k -> new TLongLongHashMap());
                    // feature values are stored as fixed-point longs (scaled by PRECISION_INV)
                    fmapt.put(
                        docId, (long) (((Double) f.doubleValue()) * InfoGainCalc.PRECISION_INV));
                  });
        });

    ArrayList<FeatureNode> arrTemp = new ArrayList<>();
    // convert to featureNodes
    fmap.forEach(
        (fname, arr) -> {
          FeatureNode node = new FeatureNode(fname, arr);
          node.eval = estimateInfoGain.apply(node);
          arrTemp.add(node);
        });

    // optimization: get only best info gain features
    Collections.sort(arrTemp, (n1, n2) -> -Double.compare(n1.eval, n2.eval));
    for (int i = 0; i < arrTemp.size() && i < nOfBestToUse; i++) {
      inverseIndex.put(arrTemp.get(i).featureName, arrTemp.get(i));
    }

    System.out.println("Doing clustering");

    AgglomerativeSampling<FeatureNode> clustering =
        new AgglomerativeSampling<>(evaluator, combiner, inverseIndex.values());

    clustering.setMaxSamples(nOfBestToUse);
    clustering.doClustering(nClusters);

    // collect clusters
    clusters = new HashMap<>();
    clustering.forEachCluster(
        (c) -> {
          Set<String> features = new HashSet<>();
          c.forEachLeaf((l) -> features.add(l.getPoint().featureName));
          if (!features.isEmpty()) {
            clusters.put(clusters.size(), features);
          }
        });

    // release memory
    inverseIndex = null;
    return null;
  }
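
  // As above, the fields referenced in preProcess are declared elsewhere in the class.
  // The declarations below are an assumption inferred from their usage in this method,
  // not part of the original source.
  private Map<String, FeatureNode> inverseIndex; // feature name -> node; released after clustering
  private Map<Integer, Set<String>> clusters;    // cluster id -> names of the features it contains
  private String className;                      // label treated as the positive class
  private String sourceFeatureSet;               // feature set to read per-document values from
  private int nOfBestToUse;                      // keep only the top-N features by info gain
  private int nClusters;                         // target number of clusters
  private boolean isMaxCombiner;                 // true: merge feature values by max, false: by sum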