/** * Loads the cached features for a given document * * @param document * @param documentFile The cache file for the document. * @return the cached features if possible. Null if a cache doesn't exist or it fails to get them. * @throws Exception */ private List<EventSet> getCachedFeatures(Document document, File documentFile) { List<EventSet> generatedEvents = null; BufferedReader reader = null; if (documentFile.exists() && !documentFile.isDirectory() && documentFile.canRead()) { try { reader = new BufferedReader(new FileReader(documentFile)); } catch (FileNotFoundException e) { // shouldn't ever get here.. just put this here so I can keep track // of exceptions below. e.printStackTrace(); } } else { return null; } try { // cachedPath is the path to the document that was used when the cache for that // document was created. cachedLastModified is the last modified time stamp on the // document that was cached. String cachedPath = reader.readLine(); long cachedLastModified = Long.parseLong(reader.readLine()); String path = document.getFilePath(); File currDoc = new File(path); long lastModified = currDoc.lastModified(); if (!(currDoc.getCanonicalPath().equals(cachedPath) && lastModified == cachedLastModified)) { // cache is invalid reader.close(); return null; } String line = null; generatedEvents = new ArrayList<EventSet>(); while ((line = reader.readLine()) != null) { if (line.isEmpty()) continue; EventSet es = new EventSet(); es.setAuthor(document.getAuthor()); es.setDocumentName(document.getTitle()); es.setEventSetID(line); String event = null; while ((event = reader.readLine()) != null) { if (line.isEmpty()) continue; if (event.equals(",")) // delimiter for event sets break; es.addEvent(new Event(event)); } generatedEvents.add(es); } reader.close(); } catch (IOException e) { e.printStackTrace(); return null; } return generatedEvents; }
/** * Extracts a list of all features to be used for analysis. * * @param culledEventSets * @param relevantEvents * @param cumulativeFeatureDriver * @return * @throws Exception */ public List<String> getFeatureList( List<List<EventSet>> culledEventSets, List<EventSet> relevantEvents, CumulativeFeatureDriver cumulativeFeatureDriver) throws Exception { // remove the metdata prior to generating attribute list ArrayList<EventSet> docMetaData = new ArrayList<EventSet>(); for (List<EventSet> les : culledEventSets) { docMetaData.add(les.remove(les.size() - 1)); } // initialize useful things int numOfFeatureClasses = relevantEvents.size(); List<EventSet> list; List<String> features = new ArrayList<String>(numOfFeatureClasses); // initialize list of sets of events, which will eventually become the // attributes List<EventSet> allEvents = new ArrayList<EventSet>(numOfFeatureClasses); // Neither the doc title nor the author is in the List<List<EventSet>>, so this should work fine for (int currEventSet = 0; currEventSet < numOfFeatureClasses; currEventSet++) { // initialize relevant list of event sets and histograms list = new ArrayList<EventSet>(); for (int i = 0; i < numOfFeatureClasses; i++) list.add(relevantEvents.get(i)); // initialize eventSet EventSet events = new EventSet(); events.setEventSetID(relevantEvents.get(currEventSet).getEventSetID()); if (cumulativeFeatureDriver.featureDriverAt(currEventSet).isCalcHist()) { // histogram feature // generate event histograms and unique event list EventSet eventSet = list.get(currEventSet); for (Event event : eventSet) { events.addEvent(event); } allEvents.add(events); } else { // one unique numeric event // generate sole event (give placeholder value) Event event = new Event("{-}"); events.addEvent(event); allEvents.add(events); } } // Adds all of the events to the fast vector int featureIndex = 0; for (EventSet es : allEvents) { Iterator<Event> iterator = es.iterator(); if (cumulativeFeatureDriver.featureDriverAt(featureIndex).isCalcHist()) { if (iterator.hasNext()) { // grab first event; there should be at least one Event nextEvent = (Event) iterator.next(); // get and add all middle events if they exist while (iterator.hasNext()) { features.add(nextEvent.getEvent()); nextEvent = (Event) iterator.next(); } // add the last event features.add(nextEvent.getEvent()); } } else { features.add(es.getEventSetID()); } featureIndex++; } // add the metadata back in int index = 0; for (List<EventSet> les : culledEventSets) { les.add(docMetaData.get(index)); index++; } return features; }
/** * Extracts the List of EventSets from a document using the provided CumulativeFeatureDriver.<br> * * @param document the document to have features extracted and made into event sets * @param cumulativeFeatureDriver the driver containing the features to be extracted and the * functionality to do so * @param loadDocContents whether or not the document contents are already loaded into the object * @return the List of EventSets for the document */ public List<EventSet> extractEventSets( Document document, CumulativeFeatureDriver cumulativeFeatureDriver, boolean loadDocContents, boolean isUsingCache) throws Exception { List<EventSet> generatedEvents = new ArrayList<EventSet>(); if (isUsingCache) { File cacheDir = new File(JSANConstants.JSAN_CACHE + "_" + cumulativeFeatureDriver.getName() + "/"); File authorDir = null; if (document.getAuthor().equals(JSANConstants.DUMMY_NAME)) { authorDir = new File(cacheDir, "you"); } else { authorDir = new File(cacheDir, "_" + document.getAuthor()); } File documentFile = new File(authorDir, document.getTitle() + ".cache"); generatedEvents = getCachedFeatures(document, documentFile); if (generatedEvents == null) { // delete the cache for this document! It is invalid documentFile.delete(); // program will continue as normal, extracting events } else { // return the cached features return generatedEvents; } } // Extract the Events from the documents try { generatedEvents = cumulativeFeatureDriver.createEventSets(document, loadDocContents, isUsingCache); } catch (Exception e) { LOG.error("Failed to extract events from a document!", e); throw e; } // create metadata event to store document information EventSet documentInfo = new EventSet(); documentInfo.setEventSetID("<DOCUMENT METADATA>"); /* * Metadata Event format: * * EventSetID: "<DOCUMENT METADATA>" Event at Index: * 0 : author * 1 : title * 2 : Sentences in document * 3 : Words in document * 4 : Characters in document * 5 : Letters in document */ // Extract document title and author Event authorEvent = new Event(document.getAuthor()); // Event titleEvent = new Event(document.getFilePath()); Event titleEvent = new Event(document.getTitle()); documentInfo.addEvent(authorEvent); documentInfo.addEvent(titleEvent); // Extract normalization baselines // Sentences in doc { Document doc = null; SingleNumericEventDriver counter = new SentenceCounterEventDriver(); doc = document; Event tempEvent = null; try { if (!loadDocContents) doc.load(); tempEvent = new Event("" + (int) counter.getValue(doc)); } catch (Exception e) { LOG.error("Failed to extract num sentences from document", e); throw e; } documentInfo.addEvent(tempEvent); } // Words in doc { Document doc = null; SingleNumericEventDriver counter = new WordCounterEventDriver(); doc = document; Event tempEvent = null; try { if (!loadDocContents) doc.load(); tempEvent = new Event("" + (int) counter.getValue(doc)); } catch (Exception e) { LOG.error("Failed to extract num words from document", e); throw e; } documentInfo.addEvent(tempEvent); } // Characters in doc { Document doc = null; SingleNumericEventDriver counter = new CharCounterEventDriver(); doc = document; Event tempEvent = null; try { if (!loadDocContents) doc.load(); tempEvent = new Event("" + (int) counter.getValue(doc)); } catch (Exception e) { LOG.error("Failed to extract num characters from document", e); throw e; } documentInfo.addEvent(tempEvent); } // Letters in doc { Document doc = null; SingleNumericEventDriver counter = new LetterCounterEventDriver(); doc = document; Event tempEvent = null; try { if (!loadDocContents) doc.load(); tempEvent = new Event("" + (int) counter.getValue(doc)); } catch (Exception e) { LOG.error("Failed to extract num letters from document", e); throw e; } documentInfo.addEvent(tempEvent); } // add the metadata EventSet to the List<EventSet> generatedEvents.add(documentInfo); // return the List<EventSet> return generatedEvents; }
/** * Goes over the culled List of Lists of EventSets and determines which events are histograms and * which have a single numerical value.<br> * Uses the information to prepare a List of EventSets to extract from the test document(s). * * @param culledEventSets The culled List of Lists of EventSets * @param cumulativeFeatureDriver The driver used to extract the EventSets * @return The List of EventSet to extract from the test document(s) * @throws Exception */ public List<EventSet> getRelevantEvents( List<List<EventSet>> culledEventSets, CumulativeFeatureDriver cumulativeFeatureDriver) throws Exception { // remove the metadata prior to generating the relevantEvents ArrayList<EventSet> docMetaData = new ArrayList<EventSet>(); for (List<EventSet> les : culledEventSets) { docMetaData.add(les.remove(les.size() - 1)); } // initialize the EventSet list List<EventSet> relevantEvents = new LinkedList<EventSet>(); // iterate over the List of Lists for (List<EventSet> l : culledEventSets) { // iterate over each inner list's eventSets int featureIndex = 0; for (EventSet esToAdd : l) { // whether or not to add the event set to the list (if false, it // is already on the list) boolean add = true; ; for (EventSet esl : relevantEvents) { // this should compare the category/name of the event set if (esToAdd.getEventSetID().equals(esl.getEventSetID())) { add = false; break; } } // this event set isn't on the list at all, just add it (which // also adds its internal events) to the list if (add) { EventSet temp = new EventSet(); temp.setEventSetID(esToAdd.getEventSetID()); // for all of the events for (Event e : esToAdd) { boolean absent = true; // check to see if it's been added yet or not for (Event ae : temp) { if (ae.getEvent().equals(e.getEvent())) { absent = false; break; } } // if it has not been added, add it if (absent) { if (!cumulativeFeatureDriver.featureDriverAt(featureIndex).isCalcHist()) temp.addEvent(new Event("{-}")); else temp.addEvent(e); } } relevantEvents.add(temp); } else { // go through this eventSet and add any events to the // relevant EventSet if they aren't already there. if (cumulativeFeatureDriver.featureDriverAt(featureIndex).isCalcHist()) { for (Event e : esToAdd) { boolean toAdd = true; // for all events in the relecant list for (Event re : relevantEvents.get(featureIndex)) { // if it's already there, don't add it if (e.getEvent().equals(re.getEvent())) { toAdd = false; break; } } // add it if it isn't there if (toAdd) { relevantEvents.get(featureIndex).addEvent(e); } } } } featureIndex++; } } // add the metadata back in int index = 0; for (List<EventSet> les : culledEventSets) { les.add(docMetaData.get(index)); index++; } return relevantEvents; }