/** * Take webpage content and do the following: 1)identify where soft segments are 2)compute score * based on distribution of terms and similarity 3)move segments around */ private void groupSegments(Document doc) { Cluster cluster = new Cluster(); boolean isSameCluster; int textSize = doc.body().text().split(" ").length; int count = 0; for (int i = 0; i < webPageSegments.size(); i++) { isSameCluster = cluster.isSameCluster(webPageSegments.get(i)); if (!isSameCluster) { if (cluster.segments.size() != 0) { count += cluster.getSegmentText().split(" ").length; double position = count / ((double) textSize); if (cluster.getText().trim().length() > 100) { addNewWebPageSection(cluster, position); } } cluster = new Cluster(); } cluster.addSegment(webPageSegments.get(i)); } // last cluster if (cluster != null) { count += cluster.getSegmentText().split(" ").length; double position = count / ((double) textSize); if (cluster.getText().trim().length() > 100) { addNewWebPageSection(cluster, position); } } }
private void addNewWebPageSection(Cluster cluster, double pageLocation) { WebPageSection webPageSection = new WebPageSection(); webPageSection.addCluster(cluster); // Need to add segment name webPageSection.setSectionSize(cluster.getSegmentText().split(" ").length); CharacterBasedDistribution distribution = new CharacterBasedDistribution(cluster.getSegmentText()); webPageSection.setDistrbution(distribution.distribution); webPageSection.setPageLocation(pageLocation); clusters.add(webPageSection); }