Пример #1
0
  /**
   * Groups the entries of {@code webPageSegments} into runs of consecutive segments (clusters)
   * and registers every sufficiently long cluster as a web page section.
   *
   * <p>Segments are visited in list order. A segment is appended to the current cluster as long
   * as {@code Cluster.isSameCluster} accepts it; otherwise the current cluster is flushed via
   * {@code flushCluster} and a fresh cluster is started with that segment. The cluster that is
   * still open after the last segment is flushed the same way.
   *
   * <p>The position handed to each section is the running count of space-separated tokens of all
   * non-empty clusters flushed so far (the current one included, and also those too short to be
   * registered), divided by the token count of the document's body text.
   *
   * <p>NOTE(review): if the body text yields zero tokens the position is not finite; this is not
   * handled here — confirm whether callers can pass such a document.
   *
   * @param doc parsed page; only its body text is read, to obtain the total token count
   */
  private void groupSegments(Document doc) {
    int totalTokens = doc.body().text().split(" ").length;
    int tokensSoFar = 0;
    Cluster cluster = new Cluster();

    for (int i = 0; i < webPageSegments.size(); i++) {
      if (!cluster.isSameCluster(webPageSegments.get(i))) {
        // The segment does not fit: close the current cluster and open a new one for it.
        tokensSoFar = flushCluster(cluster, tokensSoFar, totalTokens);
        cluster = new Cluster();
      }
      cluster.addSegment(webPageSegments.get(i));
    }

    // The last cluster is never closed inside the loop. It is empty only when there were no
    // segments at all, in which case flushCluster leaves it alone.
    flushCluster(cluster, tokensSoFar, totalTokens);
  }

  /**
   * Closes one cluster: adds its token count to the running total and, if its text is longer than
   * 100 characters after trimming, registers it through {@code addNewWebPageSection}.
   *
   * @param cluster cluster to close; a cluster without segments is ignored entirely
   * @param tokensSoFar tokens counted in previously flushed clusters
   * @param totalTokens token count of the whole page body, used as the position denominator
   * @return the running token count including this cluster ({@code tokensSoFar} unchanged when
   *     the cluster has no segments)
   */
  private int flushCluster(Cluster cluster, int tokensSoFar, int totalTokens) {
    if (cluster.segments.size() == 0) {
      return tokensSoFar;
    }
    int tokensThroughCluster = tokensSoFar + cluster.getSegmentText().split(" ").length;
    // Position is measured at the end of the cluster, as a fraction of the page's tokens.
    double position = tokensThroughCluster / ((double) totalTokens);
    if (cluster.getText().trim().length() > 100) {
      addNewWebPageSection(cluster, position);
    }
    return tokensThroughCluster;
  }
Пример #2
0
  /**
   * Wraps a cluster in a new {@code WebPageSection}, fills in its size, character distribution
   * and page location, and appends it to {@code clusters}.
   *
   * @param cluster cluster to wrap; its segment text supplies the size and the distribution
   * @param pageLocation location value stored on the section as given
   */
  private void addNewWebPageSection(Cluster cluster, double pageLocation) {
    final WebPageSection section = new WebPageSection();
    section.addCluster(cluster);

    // TODO: the segment name still needs to be added to the section.

    // Section size is the number of space-separated tokens in the cluster's segment text.
    final String[] tokens = cluster.getSegmentText().split(" ");
    section.setSectionSize(tokens.length);

    final String segmentText = cluster.getSegmentText();
    final CharacterBasedDistribution charDistribution = new CharacterBasedDistribution(segmentText);
    section.setDistrbution(charDistribution.distribution);

    section.setPageLocation(pageLocation);

    clusters.add(section);
  }