/**
 * Reports the languages a piece of text is likely written in, as returned by the detector.
 *
 * <p>A new detector is requested from {@code DetectorFactory}, the text is appended to it, and
 * the detector's probability list is handed back unchanged.
 *
 * @param text the text to analyse
 * @return the list of candidate languages produced by the detector for {@code text}
 * @throws LangDetectException propagated from the underlying detector calls
 */
public ArrayList<Language> detectLangs(String text) throws LangDetectException {
    final Detector langDetector = DetectorFactory.create();
    langDetector.append(text);

    final ArrayList<Language> candidates = langDetector.getProbabilities();
    return candidates;
}
private String detectLanguage(String aDocumentText) throws AnalysisEngineProcessException { String language = "x-unspecified"; try { Detector detector = DetectorFactory.create(); detector.append(aDocumentText); language = detector.detect(); } catch (LangDetectException e) { // "no features in text" might occur if a message composes for instance of a single // numeric value // we silently ignore this particular error message, but throw all other if (!isFeatureException(e)) { throw new AnalysisEngineProcessException(e); } } return language; }
/**
 * Detects the language of the given text.
 *
 * <p>The result is the empty string when {@code text} is {@code null} or when the detection
 * step itself throws; failures while creating the detector or appending the text are not
 * caught here and propagate to the caller.
 *
 * @param text the text whose language should be detected; may be {@code null}
 * @return the language code reported by the detector (e.g. "en" for English), or {@code ""}
 *         if no language could be determined
 * @throws LangDetectException if creating the detector or appending the text fails
 */
public static String detect(String text) throws LangDetectException {
    if (text == null) {
        return "";
    }

    final Detector langDetector = DetectorFactory.create();
    langDetector.append(text);

    try {
        return langDetector.detect();
    } catch (Exception ignored) {
        // Any failure during detection is reported as "no language" rather than an error.
        return "";
    }
}
private static ArrayList<String> ProcessTimeLine(String user) throws InterruptedException, TwitterException { ArrayList<String> Tweets = new ArrayList<String>(); ConfigurationBuilder cb = new ConfigurationBuilder(); cb.setDebugEnabled(true) .setOAuthConsumerKey(KEY) .setOAuthConsumerSecret(SECRET) .setOAuthAccessToken(ACCESSTOKEN) .setOAuthAccessTokenSecret(ACCESSSECRET); cb.setJSONStoreEnabled(true); // gets Twitter instance with default credentials boolean bWait = true; Twitter twitter = new TwitterFactory(cb.build()).getInstance(); do { try { Map<String, RateLimitStatus> oRT = twitter.getRateLimitStatus(); RateLimitStatus rateLimit = oRT.get("/statuses/user_timeline"); int remaining = rateLimit.getRemaining(); System.out.print("(Remaining API calls: " + remaining + ")"); int remainingTime = rateLimit.getSecondsUntilReset(); if (remaining <= NUM_TWEETS / 200 + 1) { System.out.println("Waiting " + remainingTime + " seconds"); Thread.sleep(remainingTime * 1000); } else bWait = false; } catch (Exception te) { if (te.toString().toLowerCase().contains("rate limit") && !te.toString().toLowerCase().contains("bad authentication data")) { System.out.println("Waiting 60s"); Thread.sleep(60 * 1000); } else { bWait = false; } } } while (bWait); try { Detector detector = DetectorFactory.create(); List<Status> statuses; int iPage = 1; int iTweets = 0; do { int iPageSize = 0; if (iTweets + 200 < NUM_TWEETS) { iPageSize = 200; } else { iPageSize = NUM_TWEETS - iTweets; } statuses = twitter.getUserTimeline(user, new Paging(iPage, iPageSize)); for (Status status : statuses) { String sStatusId = "-1"; try { if ((status.getRetweetedStatus() != null) && (status.getRetweetedStatus().getUser() != null)) { continue; } try { detector.append(Simplify(status.getText())); if (detector.detect().equalsIgnoreCase("es")) { String sStatusJSON = DataObjectFactory.getRawJSON(status); Tweets.add(sStatusJSON); } } catch (Exception exl) { } } catch (Exception ex) { System.out.println("ERROR in status 
id " + sStatusId); } iTweets++; } iPage++; } while (statuses.size() > 0 && iTweets < NUM_TWEETS); } catch (TwitterException te) { te.printStackTrace(); System.out.println("Failed to get timeline: " + te.getMessage()); } catch (Exception ex) { } System.out.println("..." + Tweets.size() + " tweets."); return Tweets; }
/**
 * Searches Twitter for {@code text} and stores every Latvian ("lv") or Russian ("ru") tweet
 * created after {@code getDate} in the {@code tweet} and {@code tweet_brand} tables.
 *
 * <p>Up to 15 result pages of 100 tweets are requested; paging stops as soon as a page
 * returns a number of tweets other than 100. After the search, {@code text} followed by a
 * space is printed to standard output.
 *
 * @param id brand id written to {@code tweet_brand} alongside each stored tweet id
 * @param text the search query
 * @param conn open database connection used for the inserts; not closed by this method
 * @param getDate lower bound for tweet creation time, formatted {@code yyyy-MM-dd HH:mm:ss}
 * @return number of "lv"/"ru" tweets newer than {@code getDate} that were found; a tweet is
 *         counted even when its insert failed
 * @throws TwitterException if the search request fails
 * @throws SQLException if the insert statements cannot be prepared or closed
 * @throws LangDetectException if a language detector cannot be created or fed
 * @throws ParseException if {@code getDate} does not match the expected format
 */
public static int searchForTwits(int id, String text, Connection conn, String getDate)
        throws TwitterException, SQLException, LangDetectException, ParseException {
    // The format and the cut-off date do not change per tweet, so build them once.
    SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    java.util.Date oldDate = timestampFormat.parse(getDate);

    Twitter twitter = new TwitterFactory().getInstance();
    int n = 0;

    // Tweet text and user names are untrusted input: bind them as parameters instead of
    // concatenating them into the SQL. Values are bound as strings, mirroring the quoted
    // literals the statements used before.
    java.sql.PreparedStatement insertTweet = conn.prepareStatement(
            "INSERT INTO tweet VALUES (null, ?, ?, ?, ?, null, null, null)");
    try {
        java.sql.PreparedStatement insertTweetBrand = conn.prepareStatement(
                "INSERT INTO tweet_brand VALUES (null, ?, ?)");
        try {
            int pageNumber = 1;
            do {
                Query query = new Query(text).rpp(100).page(pageNumber);
                QueryResult result = twitter.search(query);

                int countTweets = 0;
                for (Tweet tweet : result.getTweets()) {
                    countTweets++;

                    java.util.Date date = tweet.getCreatedAt();
                    if (!date.after(oldDate)) {
                        continue;
                    }

                    Detector detector = DetectorFactory.create();
                    detector.append(tweet.getText());
                    try {
                        String lang = detector.detect();
                        if (lang.equals("lv") || lang.equals("ru")) {
                            try {
                                String tweetId = String.valueOf(tweet.getId());

                                insertTweet.setString(1, tweetId);
                                insertTweet.setString(2, tweet.getFromUser());
                                // The apostrophe replacement is no longer needed for quoting;
                                // it is kept so stored text matches rows written earlier.
                                insertTweet.setString(3, tweet.getText().replace("'", "’"));
                                insertTweet.setString(4, timestampFormat.format(date));
                                insertTweet.executeUpdate();

                                insertTweetBrand.setString(1, tweetId);
                                insertTweetBrand.setString(2, String.valueOf(id));
                                insertTweetBrand.executeUpdate();
                            } catch (SQLException ignored) {
                                // Best effort: a tweet that cannot be stored is skipped.
                                // NOTE(review): presumably duplicate rows -- confirm.
                            }
                            n++;
                        }
                    } catch (LangDetectException ignored) {
                        // Best effort: a tweet whose language cannot be detected is skipped.
                    }
                }

                pageNumber++;
                // Anything other than a full page of 100 ends the paging.
                if (countTweets != 100) {
                    break;
                }
            } while (pageNumber != 16);
        } finally {
            insertTweetBrand.close();
        }
    } finally {
        insertTweet.close();
    }

    System.out.print(text + " ");
    return n;
}