/**
 * Extracts candidate entities from a single tweet: capitalized tokens and
 * noun phrases, minus stop words.
 */
public HashSet<String> getEntitiesInTweet(String tweet) {
    HashSet<String> entities = new HashSet<String>();
    TwitterTokenizer tweetTokenizer = new TwitterTokenizer();
    // Matches tokens that begin with an upper-case letter.
    Pattern capitalized = Pattern.compile("^[A-Z]+.*");
    for (String token : tweetTokenizer.tokenize(tweet)) {
        token = token.trim();
        // Collapse punctuation (anything other than letters, digits, and
        // periods) bordering whitespace into a single space.
        token = token.replaceAll(
                "( [^a-zA-Z0-9\\.])|( [^a-zA-Z0-9\\.] )|([^a-zA-Z0-9\\.] )", " ");
        try {
            for (String s : token.split("\\s+")) {
                s = s.trim();
                // Keep capitalized words that are not stop words.
                if (capitalized.matcher(s).matches()
                        && !stopWords.contains(s.toLowerCase())) {
                    entities.add(s);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        // Also keep noun phrases found by the noun-phrase extractor.
        for (String np : npe.extract(token)) {
            if (!stopWords.contains(np.trim().toLowerCase())) {
                entities.add(np.trim());
            }
        }
    }
    return entities;
}
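A minimal usage sketch for the method above. The enclosing class name and its construction are assumptions; the stopWords set and npe extractor are presumed to be initialized elsewhere in the class.

// Hypothetical usage: "EntityExtractor" is an assumed class name.
EntityExtractor extractor = new EntityExtractor();
HashSet<String> entities = extractor.getEntitiesInTweet(
        "Spent the afternoon reading about the Eiffel Tower in Paris");
// Prints capitalized tokens such as "Eiffel" and "Paris", plus any
// noun phrases the extractor finds.
for (String entity : entities) {
    System.out.println(entity);
}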
/**
 * Reads the tweets stored in data/&lt;handle&gt;.txt, extracts capitalized words
 * and noun phrases from each tweet, logs them to data/&lt;handle&gt;_entities.txt,
 * and writes frequency statistics to data/&lt;handle&gt;_statistics.txt.
 * Returns all entities found, keyed by surface form.
 */
public HashMap<String, Entity> getAllEntities(String handle) {
    HashMap<String, Entity> allEntities = new HashMap<String, Entity>();
    // try-with-resources closes all three streams even if an exception is thrown.
    try (BufferedReader br = new BufferedReader(new FileReader("data/" + handle + ".txt"));
         BufferedWriter bw = new BufferedWriter(new FileWriter("data/" + handle + "_entities.txt"));
         BufferedWriter bw1 = new BufferedWriter(new FileWriter("data/" + handle + "_statistics.txt"))) {
        Counter<String> nPhraseCounter = new Counter<String>();
        Counter<String> capitalsCounter = new Counter<String>();
        TwitterTokenizer tweetTokenizer = new TwitterTokenizer();
        // Matches tokens that begin with an upper-case letter.
        Pattern capitalized = Pattern.compile("^[A-Z]+.*");
        String line;
        while ((line = br.readLine()) != null) {
            // Drop retweet markers (note: this removes "RT" anywhere in the
            // line, including inside words).
            line = line.replaceAll("RT", "");
            for (String token : tweetTokenizer.tokenize(line)) {
                token = token.trim();
                // Collapse punctuation bordering whitespace into a single space.
                token = token.replaceAll(
                        "( [^a-zA-Z0-9\\.])|( [^a-zA-Z0-9\\.] )|([^a-zA-Z0-9\\.] )", " ");
                ArrayList<String> nPhrases = new ArrayList<String>();
                HashSet<String> capitalWords = new HashSet<String>();
                try {
                    for (String s : token.split("\\s+")) {
                        if (capitalized.matcher(s).matches()
                                && !stopWords.contains(s.toLowerCase())) {
                            capitalWords.add(s.toLowerCase());
                            capitalsCounter.incrementCount(s.toLowerCase(), 1.0);
                            if (allEntities.containsKey(s.trim())) {
                                // Known entity: record this tweet at most once.
                                Entity e = allEntities.get(s.trim());
                                if (!e.tweets.contains(line)) {
                                    e.tweets.add(line);
                                }
                            } else {
                                Entity e = new Entity(s.trim());
                                e.tweets.add(line);
                                allEntities.put(s.trim(), e);
                            }
                        }
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                }
                bw.write("===============================================\n");
                bw.write(token + "\n");
                System.out.println("token: " + token);
                // Noun phrases get the same entity bookkeeping as capitals.
                for (String np : npe.extract(token)) {
                    if (!stopWords.contains(np.trim().toLowerCase())) {
                        nPhrases.add(np.trim());
                        nPhraseCounter.incrementCount(np.trim(), 1.0);
                        if (allEntities.containsKey(np.trim())) {
                            Entity e = allEntities.get(np.trim());
                            if (!e.tweets.contains(line)) {
                                e.tweets.add(line);
                            }
                        } else {
                            Entity e = new Entity(np.trim());
                            e.tweets.add(line);
                            allEntities.put(np.trim(), e);
                        }
                    }
                }
                bw.write("===============================================\n");
                bw.write("Noun-Phrases: " + nPhrases.toString() + "\n");
                if (capitalWords.isEmpty()) {
                    bw.write("No capitals\n\n");
                } else {
                    bw.write("Capitals: " + capitalWords.toString() + "\n\n");
                }
            }
            bw.flush();
        }
        // Dump the statistics file in descending order of frequency;
        // asPriorityQueue() is assumed to iterate keys from highest to
        // lowest count (Berkeley-NLP-style Counter).
        PriorityQueue<String> nPhraseQueue = nPhraseCounter.asPriorityQueue();
        while (nPhraseQueue.hasNext()) {
            String np = nPhraseQueue.next();
            bw1.write(np + " " + nPhraseCounter.getCount(np) + "\n");
        }
        bw1.write("=========================================================\n");
        PriorityQueue<String> capitalQueue = capitalsCounter.asPriorityQueue();
        while (capitalQueue.hasNext()) {
            String cap = capitalQueue.next();
            bw1.write(cap + " " + capitalsCounter.getCount(cap) + "\n");
        }
        bw1.flush();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return allEntities;
}
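The methods above depend on a few members that are not shown: the stopWords set, the noun-phrase extractor npe, and the Entity class. A minimal sketch of what Entity might look like, inferred only from the constructor and field accessed above; any member beyond those is an assumption.

import java.util.ArrayList;

// Minimal sketch of the Entity type, inferred from its use in getAllEntities:
// it needs a String constructor and a public, mutable "tweets" list.
public class Entity {
    public String name;                                        // surface form of the entity
    public ArrayList<String> tweets = new ArrayList<String>(); // tweets that mention it

    public Entity(String name) {
        this.name = name;
    }
}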