/** * @param args the command line arguments * @throws java.lang.Exception */ public static void main(String[] args) throws Exception { // TODO code application logic here String dataDir; if (args.length == 0) { System.err.println( "ERROR: incorrect parameters for eecs.oregonstate.edu.preproccessing.preproccessing! " + "only " + args.length + " parameters :-("); System.err.println("Usage: java -jar PatentSearch.jar [options]"); System.err.println("[options] have to be defined in the following order:"); System.err.println("[-dataDir]: directory of the data"); System.err.println("[-outDir]: output directory."); } else { // args = new String[1]; // args[0] = "/Users/rbouadjenek/Documents/SocialMediaAnalysis/dataset/"; dataDir = args[0]; AddingIds pre = new AddingIds(args[1], Integer.parseInt(args[2])); long start = System.currentTimeMillis(); pre.process(dataDir); long end = System.currentTimeMillis(); long millis = (end - start); System.err.println( "------------------------------------------------------------------------"); System.err.println( "There are " + pre.getNbrDocs() + " documents processed in " + Functions.getTimer(millis) + "."); System.err.println( "------------------------------------------------------------------------"); } }
/**
 * Recursively processes the given directory. Sub-directories are descended
 * into; every regular file that is not hidden, exists, is readable and is
 * accepted by a {@code TextFilesFilter} is counted in {@code nbrDocs},
 * unzipped next to the original (same path with the last extension removed),
 * parsed with {@code parseFile}, and the unzipped copy is then deleted.
 * A per-file summary line is written to standard error.
 *
 * @param dataDir directory of the data to be preprocessed
 * @throws java.io.IOException if {@code dataDir} is not a directory or its
 *     contents cannot be listed
 * @throws java.lang.Exception if unzipping or parsing a file fails
 */
public void process(String dataDir) throws Exception {
    TextFilesFilter filter = new TextFilesFilter();
    File f = new File(dataDir);
    File[] listFiles = f.listFiles();
    // File.listFiles() returns null (rather than throwing) when the path is not a
    // directory or an I/O error occurs; fail with a message that names the path
    // instead of a bare NullPointerException from the loop below.
    if (listFiles == null) {
        throw new java.io.IOException(
            "Cannot list the content of '" + dataDir
                + "': it is not a directory or it cannot be read.");
    }
    for (File listFile : listFiles) {
        if (listFile.isDirectory()) {
            process(listFile.toString());
            continue;
        }
        if (listFile.isHidden() || !listFile.exists() || !listFile.canRead()
                || !filter.accept(listFile)) {
            continue;
        }
        nbrDocs++;
        long start = System.currentTimeMillis();
        // The unzipped copy lives next to the archive, named by stripping the
        // last ".ext" suffix from the archive's absolute path.
        File unZippedFile = new File(listFile.getAbsolutePath().replaceFirst("[.][^.]+$", ""));
        try {
            unZipIt(listFile, unZippedFile);
            int total = parseFile(unZippedFile);
            long end = System.currentTimeMillis();
            long millis = (end - start);
            System.err.println(
                "A total of " + total + " lines were parsed from the file "
                    + listFile.getName() + " in " + Functions.getTimer(millis) + ".");
        } finally {
            // Always remove the temporary unzipped copy, even when unzipping or
            // parsing fails, so aborted runs do not leave stray files behind.
            unZippedFile.delete();
        }
    }
}