Esempio n. 1
0
  /**
   * Reads tweets from {@code inputFile}, discards unwanted ones, normalizes the remainder and
   * writes the unique results to {@code outputFile}.
   *
   * <p>Each tweet is lower-cased and trimmed, then skipped when it is empty, contains
   * {@code "ã"}, or starts with {@code rt}, {@code "rt}, {@code @} or {@code http}. A surviving
   * tweet is split on {@code Utility.space}; every token is passed through
   * {@code Utility.processToken}, tokens for which that call returns {@code null} are dropped,
   * and the remaining tokens are re-joined with {@code Utility.space}. Only rebuilt tweets
   * longer than 25 characters are kept, and only the first occurrence of each distinct text.
   *
   * <p>The output preserves the order in which tweets first appear in the input.
   *
   * @param inputFile path handed to {@code Utility.readFile} to load the raw tweets
   * @param outputFile path handed to {@code Utility.writeFile} to store the filtered tweets
   */
  public void filterRetweets(String inputFile, String outputFile) {
    ArrayList<String> tweets = Utility.readFile(inputFile);
    ArrayList<String> filteredTweets = new ArrayList<String>();
    // Tracks the rebuilt tweets already emitted; the value is never read, only key presence.
    HashMap<String, Boolean> seen = new HashMap<String, Boolean>();

    for (String rawTweet : tweets) {
      // Locale.ROOT keeps case folding independent of the JVM's default locale, so the
      // normalized text (and therefore de-duplication) is the same on every machine.
      String tweet = rawTweet.toLowerCase(java.util.Locale.ROOT).trim();

      // NOTE(review): "ã" presumably flags mis-encoded text — confirm with the data source.
      if (tweet.isEmpty()
          || tweet.contains("ã")
          || tweet.startsWith("rt")
          || tweet.startsWith("\"rt")
          || tweet.startsWith("@")
          || tweet.startsWith("http")) {
        continue;
      }

      // Rebuild the tweet from its processed tokens, dropping tokens mapped to null.
      StringBuilder rebuilt = new StringBuilder();
      for (String rawToken : tweet.split(Utility.space)) {
        String token = Utility.processToken(rawToken);
        if (token != null) {
          rebuilt.append(token.trim()).append(Utility.space);
        }
      }
      String cleaned = rebuilt.toString().trim();

      // put() returns null only the first time a key is stored, so each distinct tweet is
      // appended exactly once, in input order.
      if (cleaned.length() > 25 && seen.put(cleaned, Boolean.TRUE) == null) {
        filteredTweets.add(cleaned);
      }
    }

    Utility.writeFile(outputFile, filteredTweets);
  }