/**
 * Default constructor with the full set of parameters.
 *
 * @param delta1 Delta value (suggested 6). <br>
 *     It directly affects the number of events: increasing this value reduces their number and
 *     vice versa.
 * @param delta2 Delta2 value. <br>
 *     Prime divisors of the number of documents are required as values, so it must be
 *     cross-referenced with the number of documents. More specifically, dividing the number of
 *     documents by this value should yield the total number of windows.
 * @param gamma Gamma value (suggested 5). <br>
 *     It affects the quality of the uncovered events. Values greater than 15 seem to increase
 *     the number of uncovered events.
 * @param minTermSupport Minimum term support value (suggested 0.0001). <br>
 *     Changing this value alters the lower bound below which a term is not included in the
 *     keywords list of an event.
 * @param maxTermSupport Maximum term support value (suggested 0.01). <br>
 *     Changing this value alters the upper bound above which a term is not included in the
 *     keywords list of an event.
 * @param timeSliceA Starting timeslice.
 * @param timeSliceB Ending timeslice.
 * @param corpus An EDCoWCorpus object.
 * @see #EDCoW(int, int, int, EDCoWCorpus) EDCoW() minimum constructor.
 */
public EDCoW(
        int delta1,
        int delta2,
        int gamma,
        double minTermSupport,
        double maxTermSupport,
        int timeSliceA,
        int timeSliceB,
        EDCoWCorpus corpus) {
    this.delta = delta1;
    this.delta2 = delta2;
    this.gamma = gamma;
    this.minTermSupport = minTermSupport;
    this.maxTermSupport = maxTermSupport;
    this.timeSliceA = timeSliceA;
    this.timeSliceB = timeSliceB;
    this.countCorpus = 0;
    this.corpus = corpus;

    // Total message count of the corpus, used later to derive the term support bounds
    for (Integer numberOfDocument : corpus.getNumberOfDocuments()) {
        this.countCorpus += numberOfDocument;
    }
}
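/*
 * Usage sketch (an illustration, not part of the original class): constructing EDCoW with the
 * parameter values suggested in the Javadoc above. The corpus and the timeslice bounds are
 * assumptions; in practice delta2 and the timeslice range must be chosen so that
 * (timeSliceB - timeSliceA) / delta2 yields a whole number of windows.
 *
 *   EDCoWCorpus corpus = ...; // a corpus prepared elsewhere
 *   EDCoW edcow =
 *       new EDCoW(
 *           6,      // delta1, suggested value
 *           8,      // delta2, hypothetical; must evenly divide the timeslice range
 *           5,      // gamma, suggested value
 *           0.0001, // minimum term support, suggested value
 *           0.01,   // maximum term support, suggested value
 *           0,      // timeSliceA, hypothetical starting timeslice
 *           96,     // timeSliceB, hypothetical ending timeslice
 *           corpus);
 */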
/**
 * Method to run the algorithm and analyze terms and frequencies in a specific window.
 *
 * @param window The window index (0, 1, 2, etc.).
 * @throws java.lang.Exception General Exception.
 */
public void processWindow(int window) throws Exception {
    LinkedList<EDCoWKeyword> keyWords = new LinkedList<>();
    Integer[] distributioni = corpus.getNumberOfDocuments();
    double[] distributiond = new double[delta2];
    int startSlice = window * delta2;
    int endSlice = startSlice + delta2 - 1;
    for (int i = startSlice; i < endSlice; i++) {
        distributiond[i - startSlice] = (double) distributioni[i];
    }

    // Build the per-term signal of this window from the term/document frequencies
    termDocMap
        .entrySet()
        .stream()
        .forEach(
            (entry) -> {
                Integer[] frequencyf = entry.getValue();
                double[] frequencyd = new double[delta2];
                for (int i = startSlice; i < endSlice; i++) {
                    frequencyd[i - startSlice] = (double) frequencyf[i];
                }
                keyWords.add(new EDCoWKeyword(entry.getKey(), frequencyd, delta, distributiond));
            });

    double[] autoCorrelationValues = new double[keyWords.size()];
    for (int i = 0; i < keyWords.size(); i++) {
        autoCorrelationValues[i] = keyWords.get(i).getAutoCorrelation();
    }
    EDCoWThreshold th1 = new EDCoWThreshold();
    double theta1 = th1.theta1(autoCorrelationValues, gamma);

    // Removing trivial keywords based on theta1
    LinkedList<EDCoWKeyword> keyWordsList1 = new LinkedList<>();
    keyWords
        .stream()
        .filter((k) -> (k.getAutoCorrelation() > theta1))
        .forEach(
            (k) -> {
                keyWordsList1.add(k);
            });

    keyWordsList1
        .stream()
        .forEach(
            (kw1) -> {
                kw1.computeCrossCorrelation(keyWordsList1);
            });

    double[][] bigMatrix = new double[keyWordsList1.size()][keyWordsList1.size()];
    for (int i = 0; i < keyWordsList1.size(); i++) {
        bigMatrix[i] = keyWordsList1.get(i).getCrossCorrelation();
    }

    // Compute theta2 using the BigMatrix and discard weak cross-correlations
    double theta2 = th1.theta2(bigMatrix, gamma);
    for (int i = 0; i < keyWordsList1.size(); i++) {
        for (int j = i + 1; j < keyWordsList1.size(); j++) {
            bigMatrix[i][j] = (bigMatrix[i][j] < theta2) ? 0 : bigMatrix[i][j];
        }
    }

    // Detect communities of correlated keywords and keep those above the epsilon threshold
    EDCoWModularityDetection modularity =
        new EDCoWModularityDetection(keyWordsList1, bigMatrix, startSlice, endSlice);

    double thresholdE = 0.1;
    ArrayList<Community> finalArrCom = modularity.getCommunitiesFiltered(thresholdE);
    finalArrCom
        .stream()
        .map(
            (c) -> {
                System.out.println(c.getCommunitySize());
                return c;
            })
        .forEach(
            (c) -> {
                modularity.saveEventFromCommunity(c);
            });
    eventList.addAll(modularity.getEvents());
}
@Override
public void apply() {
    long startTime = System.currentTimeMillis();
    double minTermOccur = minTermSupport * countCorpus; // Min support * Message count corpus
    double maxTermOccur = maxTermSupport * countCorpus; // Max support * Message count corpus
    int windows = (timeSliceB - timeSliceA) / delta2;
    termDocMap = new HashMap<>();
    eventList = new LinkedList<>();

    PrintUtilities.printInfoMessageln("Calculating term frequencies...");
    List<String> terms = corpus.getTerms();
    for (int i = 0; i < terms.size(); i++) {
        String term = terms.get(i);
        if (term.length() > 1) {
            // Stopwords check removed as they are already omitted when creating the dataset
            Integer[] frequency = corpus.getDocumentsTermFrequency(i);
            int cf = 0;
            for (int freq : frequency) {
                cf += freq;
            }
            // Keep only terms whose collection frequency lies strictly between the support bounds
            if (cf > minTermOccur && cf < maxTermOccur) {
                termDocMap.put(term, frequency);
            }
        }
    }

    PrintUtilities.printInfoMessageln("Calculating windows...");
    for (int i = 0; i < windows; i++) {
        PrintUtilities.printInfoMessageln("Calculating window " + (i + 1) + "\n");
        try {
            processWindow(i);
        } catch (Exception ex) {
            Logger.getLogger(EDCoW.class.getName()).log(Level.SEVERE, null, ex);
        }
    }
    Collections.sort(eventList);

    // Convert the detected events into their final form, resolving timeslices to dates
    events = new EDCoWEvents();
    eventList
        .stream()
        .forEach(
            (event) -> {
                events.list.add(
                    new EDCoWEvent(
                        event.getKeywordsIDsAsString(),
                        corpus.getDateFromTimeSlice((int) event.startSlice)
                            + ","
                            + corpus.getDateFromTimeSlice((int) event.endSlice - 1),
                        corpus.getIDsOfWindowAsString(
                            corpus.getDateFromTimeSlice((int) event.startSlice),
                            corpus.getDateFromTimeSlice((int) event.endSlice - 1))));
            });
    events.setFullList();

    long endTime = System.currentTimeMillis();
    executionTime = (endTime - startTime) / 1000;
    PrintUtilities.printExecutionTime(
        startTime,
        endTime,
        EDCoW.class.getName(),
        Thread.currentThread().getStackTrace()[1].getMethodName());
}
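/*
 * Worked example (an illustration, not part of the original class): with the hypothetical
 * values timeSliceA = 0, timeSliceB = 96 and delta2 = 8 used in the sketch above, apply()
 * first filters the term list by the support bounds, then processes
 * (96 - 0) / 8 = 12 windows of delta2 = 8 timeslices each via processWindow(), and finally
 * assembles the sorted, date-resolved events:
 *
 *   edcow.apply();
 */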