コード例 #1
0
  /**
   * Streams the {@code input} file line by line (UTF-8), keeps only the lines whose first
   * space-separated token resolves to a {@code DataObject} via {@code getDataObject(title, map)},
   * and appends one entry per kept line to {@code output} through
   * {@code PagelinksParser.writeFileList}. Entries are buffered and written in batches of more
   * than 100000 so the whole result never has to sit in memory at once.
   *
   * <p>Failures are reported with a stack trace and end the job; they are not rethrown. Entries
   * still buffered when a lookup or write fails are not written.
   *
   * @param input path of the pagelinks file to read
   * @param output path handed to {@code PagelinksParser.writeFileList} for every batch
   * @param map lookup structure passed unchanged to {@code getDataObject}
   */
  public static void reducePagelinksByArrayList(
      String input, String output, ArrayList<DataObject> map) {

    // NOTE(review): the call that used to build this suffix
    // ("ids = createStringFromIDs(p, allIds)") is commented out in the original, so the suffix
    // is always null and every entry ends with the literal text "null". That output is kept
    // as-is here because the helper is not visible from this method; confirm the intended
    // suffix and restore it.
    final String ids = null;

    Scanner sc = null;
    try {
      sc = new Scanner(new FileInputStream(input), "UTF-8");
      long counter = 0; // lines read so far
      long founded = 0; // lines whose title resolved to a DataObject
      List<String> instance = new ArrayList<String>();

      while (sc.hasNextLine()) {
        counter++;
        if (counter % 100000 == 0) {
          System.out.println("at line " + counter);
        }

        String line = sc.nextLine();
        String title = line.split(" ")[0];
        DataObject dob = getDataObject(title, map);
        if (dob == null) {
          continue; // title not known: drop the line
        }

        founded++;
        if (founded % 1000 == 0) {
          System.out.println("founded: " + founded);
        }

        String k = dob.getTitle();
        instance.add(k + ids);

        if (instance.size() > 100000) {
          System.out.println("Writing");
          PagelinksParser.writeFileList(output, instance);
          // Start a fresh buffer instead of clearing, in case the writer keeps a reference.
          instance = new ArrayList<String>();

          System.out.println(
              "founded: " + founded + " line: " + counter + " mapsize " + map.size());
        }
      }

      if (!instance.isEmpty()) {
        PagelinksParser.writeFileList(output, instance);
      }

      // Scanner hides read errors and simply reports "no more lines"; surface them so a
      // truncated run is not mistaken for a complete one.
      if (sc.ioException() != null) {
        throw sc.ioException();
      }
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      if (sc != null) {
        sc.close(); // also closes the underlying FileInputStream
      }
    }
  }
コード例 #2
0
  /**
   * Main join/reduce step: reads {@code input} line by line, takes the leading run of
   * non-whitespace characters as the page key (namespace#nameOfPage), and, when {@code map}
   * contains that key, replaces it with the mapped ID while keeping the rest of the line
   * unchanged. Resulting lines are buffered and appended to {@code output} through
   * {@code PagelinksParser.writeFileList} in batches of more than 10000.
   *
   * <p>Lines that are empty or start with whitespace have no key and are skipped. A line that
   * consists of the key only produces just the mapped ID.
   *
   * <p>Failures are reported with a stack trace and end the job; they are not rethrown.
   *
   * <p>NOTE(review): the file is read with the platform default charset ({@code FileReader});
   * confirm whether UTF-8 should be forced here.
   *
   * @param input path of the pagelinks file to read
   * @param output path handed to {@code PagelinksParser.writeFileList} for every batch
   */
  public static void reducePagelinksByHashMap(String input, String output) {
    System.out.println("Starting reduce job");

    BufferedReader br;
    try {
      br = new BufferedReader(new FileReader(input));
    } catch (FileNotFoundException e1) {
      // Nothing to process; stop here instead of failing later on a null reader.
      e1.printStackTrace();
      return;
    }

    try {
      System.out.println("opened input stream");
      showMemoryUsage();

      ArrayList<String> list = new ArrayList<String>();
      Pattern pattern = Pattern.compile("^[^\\s]+");
      long counter = 0; // lines read so far
      int c = 0; // batches written so far
      String line;

      while ((line = br.readLine()) != null) {
        counter++;

        Matcher matcher = pattern.matcher(line);
        if (!matcher.find()) {
          continue; // no leading key on this line
        }
        String titleAlfa = matcher.group(0); // namespace#nameOfPage

        if (!map.containsKey(titleAlfa)) {
          continue;
        }
        String titleNumeric = map.get(titleAlfa); // ID for namespace#nameOfPage

        // Everything after the key (including its leading whitespace) is kept verbatim.
        // Taking it from the matcher also covers lines that hold nothing but the key.
        String ids = line.substring(matcher.end());
        list.add(titleNumeric + ids);

        if (list.size() > 10000) { // write into file
          c++;
          System.out.println("save" + " " + c + " line" + counter);

          long start = Main.startTime();
          PagelinksParser.writeFileList(output, list);
          long end = Main.endTime(start);
          System.out.println("Write> " + end);

          start = Main.startTime();
          list.clear();
          end = Main.endTime(start);
          System.out.println("Clear " + end);
        }
      }

      if (!list.isEmpty()) {
        PagelinksParser.writeFileList(output, list);
      }
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      try {
        br.close();
      } catch (Exception closeError) {
        closeError.printStackTrace();
      }
    }
  }