/**
 * Normalizes the tweets read from {@code fileName}, drops every tweet that mentions
 * {@code #sarcasm}, removes exact duplicates and writes the remainder to
 * {@code files/RandomOutput}.
 *
 * <p>Each entry returned by {@code Utility.readFile(fileName)} is lower-cased and trimmed before
 * it is inspected. Despite the method name, no retweet filtering happens here: the only
 * rejection criterion is the presence of the substring {@code #sarcasm}.
 *
 * <p>The output order is the iteration order of a {@link HashMap} key set, not the order of the
 * input.
 *
 * @param fileName location of the input handed to {@code Utility.readFile}; presumably one tweet
 *     per entry (verify against {@code Utility})
 */
public void filterRetweets2(String fileName) {
  ArrayList<String> tweets = Utility.readFile(fileName);

  // Only the keys matter: the map is used as a de-duplicating set, the value is never read.
  HashMap<String, Boolean> tweetsToBeAdded = new HashMap<String, Boolean>();
  for (String tweet : tweets) {
    // Normalize once; the result is used for both the hashtag check and the stored key.
    String normalized = tweet.toLowerCase().trim();
    if (!normalized.contains("#sarcasm")) {
      tweetsToBeAdded.put(normalized, Boolean.TRUE);
    }
  }

  // Copy the distinct tweets out in the map's key iteration order.
  ArrayList<String> filteredTweets = new ArrayList<String>(tweetsToBeAdded.keySet());
  Utility.writeFile("files/RandomOutput", filteredTweets);
}
Beispiel #2
0
 /**
  * Tries to extract a repository id from the JSON file at {@code p}.
  *
  * <p>The file content is deserialized with Gson into a {@code SerializableModel}, wrapped in a
  * {@code Model}, and its repo id is read. The id is returned only when the file name of
  * {@code p} equals {@code escapeRepoName(repoId)} ignoring case; in that case an info message is
  * logged as well.
  *
  * <p>Any failure while reading or parsing is logged as an error and reported as an empty result
  * rather than propagated.
  *
  * @param p path of the JSON file to inspect
  * @return the repo id stored in the file when it matches the file name, otherwise
  *     {@link Optional#empty()}
  */
 private static Optional<String> getRepositoryIdFromJson(Path p) {
   try {
     // SerializableModel is not generic, so the Class overload of fromJson is sufficient and
     // avoids both the anonymous TypeToken and the cast.
     SerializableModel serialized =
         new Gson()
             .fromJson(
                 Utility.readFile(String.valueOf(p.toAbsolutePath())).get(),
                 SerializableModel.class);
     String repoId = new Model(serialized).getRepoId();
     if (String.valueOf(p.getFileName()).equalsIgnoreCase(escapeRepoName(repoId))) {
       logger.info("Adding " + p.getFileName() + " to stored repository list. ");
       return Optional.of(repoId);
     }
   } catch (NullPointerException | JsonParseException | java.util.NoSuchElementException e) {
     // NoSuchElementException: presumably Utility.readFile returns an Optional that is empty when
     // the file cannot be read (TODO confirm); without this the exception would escape a method
     // that otherwise signals every failure through an empty result.
     // NullPointerException is kept from the original best-effort handling (e.g. Gson yields null
     // for empty input).
     logger.error("Unable to load repository from " + p.getFileName(), e);
   }
   return Optional.empty();
 }
  /**
   * Cleans the tweets read from {@code inputFile} and writes the distinct survivors to
   * {@code outputFile}.
   *
   * <p>Each entry returned by {@code Utility.readFile(inputFile)} is lower-cased and trimmed.
   * A tweet is discarded when it is empty, contains the character {@code ã}, or starts with
   * {@code rt}, {@code "rt}, {@code @} or {@code http}. Every remaining tweet is split on
   * {@code Utility.space}; each token is passed through {@code Utility.processToken}, tokens for
   * which that call returns {@code null} are dropped, and the rest are re-joined with
   * {@code Utility.space}. Only rebuilt tweets longer than 25 characters are kept, and exact
   * duplicates are collapsed.
   *
   * <p>The output order is the iteration order of a {@link HashMap} key set, not the order of the
   * input.
   *
   * @param inputFile location of the input handed to {@code Utility.readFile}
   * @param outputFile destination handed to {@code Utility.writeFile}
   */
  public void filterRetweets(String inputFile, String outputFile) {
    ArrayList<String> tweets = Utility.readFile(inputFile);

    // Only the keys matter: the map is used as a de-duplicating set, the value is never read.
    HashMap<String, Boolean> tweetsToBeAdded = new HashMap<String, Boolean>();

    for (String rawTweet : tweets) {
      String tweet = rawTweet.toLowerCase().trim();

      boolean rejected =
          tweet.equals("")
              || tweet.contains("ã")
              || tweet.startsWith("rt")
              || tweet.startsWith("\"rt")
              || tweet.startsWith("@")
              || tweet.startsWith("http");
      if (rejected) {
        continue;
      }

      // Rebuild the tweet from the tokens that survive Utility.processToken.
      StringBuilder rebuilt = new StringBuilder();
      for (String rawToken : tweet.split(Utility.space)) {
        String token = Utility.processToken(rawToken);
        if (token != null) {
          rebuilt.append(token.trim()).append(Utility.space);
        }
      }

      // trim() removes the separator appended after the last token.
      String cleaned = rebuilt.toString().trim();
      if (cleaned.length() > 25) {
        // Re-putting an existing key is harmless, so no containsKey check is needed.
        tweetsToBeAdded.put(cleaned, Boolean.TRUE);
      }
    }

    // Copy the distinct tweets out in the map's key iteration order.
    ArrayList<String> filteredTweets = new ArrayList<String>(tweetsToBeAdded.keySet());
    Utility.writeFile(outputFile, filteredTweets);
  }