public static void reducePagelinksByArrayList( String input, String output, ArrayList<DataObject> map) { FileInputStream inputStream = null; Scanner sc = null; try { inputStream = new FileInputStream(input); sc = new Scanner(inputStream, "UTF-8"); long counter = 0; String line = null; String id = null; String[] p = null; String title = null; String ids = null; String element = null; String allIds = null; long founded = 0; List<String> instance = new ArrayList<String>(); DataObject dob; while (sc.hasNextLine() /* && counter < 1000000 */) { counter++; if (counter % 100000 == 0) { System.out.println("at line " + counter); } line = sc.nextLine(); p = line.split(" "); title = p[0]; dob = getDataObject(title, map); if (dob == null) { line = null; ids = null; element = null; p = null; continue; } else { founded++; if (founded % 1000 == 0) { System.out.println("founded: "); } String k = dob.getTitle(); // ids = createStringFromIDs(p, allIds); element = k + "" + ids; // replace title to id instance.add(element); line = null; ids = null; allIds = null; element = null; k = null; dob = null; title = null; p = null; } if (instance.size() > 100000) { System.out.println("Writing"); PagelinksParser.writeFileList(output, instance); // instance.clear(); instance = null; instance = new ArrayList<String>(); System.out.println( "founded: " + founded + " line: " + counter + " mapsize " + map.size()); } if (founded % 30000 == 0) { System.out.println("Deleting system"); System.gc(); } } if (instance.size() > 0) { PagelinksParser.writeFileList(output, instance); instance.clear(); System.gc(); } } catch (Exception e) { e.printStackTrace(); } }
public static void reducePagelinksByHashMap(String input, String output) { /* * this is the main method of joining/reducing job * by the hashmap we replace the namespace#nameOfPage in pagelinks and write it into new file */ System.out.println("Starting reduce job"); ArrayList<String> list = new ArrayList<String>(); BufferedReader br = null; try { br = new BufferedReader(new FileReader(input)); } catch (FileNotFoundException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } try { String line = null; String[] p = null; String titleAlfa = null; String titleNumeric = null; String ids = null; String all = null; int c = 0; System.out.println("opened input stream"); showMemoryUsage(); Pattern pattern = Pattern.compile("^[^\\s]+"); long counter = 0; Matcher matcher; long start; long readLine = 0; long matcherTime = 0; long mapContainsTime = 0; long mapGetTime = 0; long splitTime = 0; long end; while ((line = br.readLine()) != null) { start = Main.startTime(); // time computing - only for benchmarking // line = sc.nextLine(); end = Main.endTime(start); readLine += end; // System.out.println("sc.next> " + end); start = Main.startTime(); matcher = pattern.matcher(line); // match the pattern if (matcher.find()) { titleAlfa = matcher.group(0); // get namespace#nameOfPage } end = Main.endTime(start); matcherTime += end; // System.out.println("matcher> " + end); counter++; // System.out.println(titleAlfa); start = Main.startTime(); if (!map.containsKey(titleAlfa)) { // if does not contain the key continue /*line = null; p = null; titleNumeric = null; titleAlfa = null;*/ // System.out.println("map.contains> " + end); continue; } end = Main.endTime(start); mapContainsTime += end; start = Main.startTime(); titleNumeric = map.get(titleAlfa); // get ID from map by namespace#nameOfPage represented by titleAlfa end = Main.endTime(start); mapGetTime += end; // System.out.println("map.get> " + end); start = Main.startTime(); p = line.split("^[^\\s]+"); ids = p[1]; all = titleNumeric + ids; // join ID of page and IDs that pointing to it end = Main.endTime(start); splitTime += end; list.add(all); line = null; p = null; titleNumeric = null; titleAlfa = null; ids = null; all = null; if (list.size() > 10000) { // write into file // showMemoryUsage(); /*System.out.println("readline " + readLine); System.out.println("matcherTime " + matcherTime); System.out.println("mapContainsTime " + mapContainsTime); System.out.println("mapGetTime " +mapGetTime); System.out.println("splitTime " +splitTime);*/ // this comments above - uncoment if you wanna know // hom much time operations take readLine = 0; matcherTime = 0; mapContainsTime = 0; mapGetTime = 0; splitTime = 0; c++; System.out.println("save" + " " + c + " line" + counter); start = Main.startTime(); PagelinksParser.writeFileList(output, list); end = Main.endTime(start); System.out.println("Write> " + end); start = Main.startTime(); list.clear(); end = Main.endTime(start); System.out.println("Clear " + end); // list = null; // list = new ArrayList<String>(); // System.gc(); // list.trimToSize(); } if (counter % 500000 == 0) { // System.gc(); } } if (list.size() > 0) { PagelinksParser.writeFileList(output, list); } } catch (Exception e) { // TODO: handle exception } finally { // sc.close(); } }