예제 #1
0
  public void indexTweet(Tweet tweet) {

    this.connect();

    DBObject dbObject =
        BasicDBObjectBuilder.start()
            // .add("_id", status.getId())
            .add("id", tweet.getId())
            .add("text", tweet.getText())
            .add("created_at", tweet.getCreatedAt())
            .add("hashtags", tweet.getHashTags())
            .add("userId", tweet.getUserId())
            .add("userName", tweet.getUserName())
            .add("userScreenName", tweet.getUserScreenName())
            .add("coordinates", tweet.getCoordinates())
            .get();

    // WriteConcern wc = WriteConcern.UNACKNOWLEDGED;
    WriteConcern wc = WriteConcern.ACKNOWLEDGED;
    wc = wc.continueOnError(true);

    WriteResult writeResult = this.dbCollection.insert(dbObject, wc);
  }
예제 #2
0
  public List<Tweet> readFile(String fileLocation, int limit, String fromTweetId) {

    long startTime = System.currentTimeMillis();

    int errors = 0;
    int added = 0;
    int ignored = 0;
    int skipped = 0;

    List<Tweet> tweets = new ArrayList<Tweet>();

    try {

      // Read gzFile
      InputStream inputStream = new GZIPInputStream(new FileInputStream(fileLocation));
      Reader decoder = new InputStreamReader(inputStream);
      BufferedReader br = new BufferedReader(decoder);

      String status;

      while ((status = br.readLine()) != null) {

        // Ignore garbage and log output lines, all statuses start with JSON bracket
        if (!status.equals("") && status.charAt(0) == '{') {
          try {
            JSONObject jsonObject = new JSONObject(status);

            // We use created_at as an indicator that this is a Tweet.
            if (jsonObject.has("created_at")) {

              Status statusObject = TwitterObjectFactory.createStatus(status);

              Tweet tweet = this.getTweetObjectFromStatus(statusObject);

              if (fromTweetId != null) {

                if (fromTweetId.equals(tweet.getId())) {
                  this.log.write("StatusFileReader - Scanner pickup from " + fromTweetId);
                  fromTweetId = null;
                } else {
                  skipped++;
                }

                continue;
              }

              added++;
              tweets.add(tweet);

              if (limit > 0 && added >= limit) {
                break;
              }

            } else {
              ignored++;
            }

          } catch (JSONException e) {
            this.log.write(
                "Exception in StatusFileReader: Json Parse Failure on: "
                    + status
                    + ", "
                    + e.getMessage());
          }
        } else {
          ignored++;
        }
      }

      br.close();
      decoder.close();
      inputStream.close();

    } catch (Exception e) {
      this.log.write(
          "Exception in StatusFileReader: Error Reading File: "
              + e.getClass().getName()
              + ": "
              + e.getMessage());
    }

    long runTimeSeconds = TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis() - startTime);

    double ops = ((double) added / (double) Math.max(runTimeSeconds, 1));

    this.log.write(
        "StatusFileReader - "
            + fileLocation
            + " processed in "
            + runTimeSeconds
            + "s. "
            + added
            + " ok / "
            + errors
            + " errors / "
            + ignored
            + " ignored / "
            + skipped
            + " skipped. "
            + ops
            + " ops. Limit: "
            + limit
            + ", Fetch: "
            + tweets.size());

    return tweets;
  }