Ejemplo n.º 1
0
  /**
   * REST endpoint returning the term frequencies for a word cloud as JSON text.
   *
   * <p>The size limits are read from the {@code visualization_MaxWordCloudSize} and
   * {@code visualization_MaxDocForClouds} properties of the database manager's configuration.
   *
   * @param keyword keyword passed to {@code SolrDBManager.getDistribution}
   * @param fields path segment handed to {@code formSearchField} to build the search fields
   * @param sources path segment selecting document types: containing {@code "video"} adds
   *     {@code "Video"}, containing {@code "news"} adds {@code "NewsArticle"}
   * @return the term frequencies serialized with {@code toString()}, or {@code "[]"} when they
   *     could not be computed
   */
  @GET
  @Path("/getWordCloud/json/{fields}/{sources}/{keyword}")
  @Produces(MediaType.APPLICATION_JSON)
  public String getWordCloud(
      @PathParam("keyword") String keyword,
      @PathParam("fields") String fields,
      @PathParam("sources") String sources) {
    SolrDBManager db = new SolrDBManager();
    JSONArray tfjson = null;
    ArrayList<String> searchField = formSearchField(fields);
    ArrayList<String> S = new ArrayList<String>();
    if (sources.contains("video")) {
      S.add("Video");
    }
    if (sources.contains("news")) {
      S.add("NewsArticle");
    }

    try {
      int maxNumOfWordsToDisplay =
          Integer.parseInt(db.conf.getProperty("visualization_MaxWordCloudSize"));
      int maxNumOfEvents = Integer.parseInt(db.conf.getProperty("visualization_MaxDocForClouds"));

      StoryDistribution distr = db.getDistribution(keyword, S, searchField, maxNumOfEvents);
      tfjson = distr.getTermFrequencies(maxNumOfWordsToDisplay);

      System.out.println("Finish generating wordcloud");
    } catch (Exception e) {
      e.printStackTrace();
    }

    // A failure above leaves tfjson null; answer with an empty JSON array instead of letting
    // tfjson.toString() throw a NullPointerException.
    if (tfjson == null) {
      return "[]";
    }
    return tfjson.toString();
  }
 /**
  * Collects the dates of the day sections found on the May 2005 month page.
  *
  * <p>For each day number 1..31 the page is searched for a {@code div} whose id is
  * {@code <day>_May_2005} inside the main content container. The id is rewritten as
  * {@code YYYY-MM-DD} so the dates can be searched in that form.
  *
  * @param doc parsed month page
  * @return the dates, formatted {@code YYYY-MM-DD}, of every day section that was found and that
  *     {@code Date.valueOf} accepts
  */
 @Override
 protected ArrayList<String> parseMonthPageForDatesOnly(Document doc) {
   ArrayList<String> dates = new ArrayList<String>();
   String baseQuery = "div#content.mw-body div#bodyContent div#mw-content-text.mw-content-ltr";
   for (int i = 1; i <= 31; i++) {
     // Build the selector from the base for every day. Appending to one growing string would
     // chain the day divs as descendants of each other, which only matches nested day divs.
     String dayQuery = baseQuery + " div#" + i + "_May_2005";
     Elements days = doc.select(dayQuery);
     for (Element eachday : days) { // an id selector, so normally at most one match
       // The id looks like "<day>_May_<year>"; convert it to YYYY-MM-DD.
       String modifiedDate = eachday.attr("id");
       int firstoccur = modifiedDate.indexOf("_");
       // Skip "_May_" (5 characters) after the day to reach the 4-digit year.
       String year = modifiedDate.substring(firstoccur + 5, firstoccur + 9);
       // First two characters are either "d_" or "dd".
       String day = modifiedDate.substring(0, 2).replace('_', ' ').trim();
       if (day.length() == 1) {
         day = "0" + day;
       }
       String actualDate = year + "-05-" + day;
       try {
         Date.valueOf(actualDate); // validation only; the string form is what gets stored
         dates.add(actualDate);
       } catch (Exception ex) {
         ex.printStackTrace();
         System.err.println("ERROR: date format is wrong!!!! date = " + actualDate);
       }
     }
   }
   return dates;
 }
Ejemplo n.º 3
0
 /**
  * REST endpoint returning the events that match a keyword.
  *
  * <p>The maximum number of events is read from the {@code visualization_MaxTimelineSize}
  * property of the database manager's configuration.
  *
  * @param keyword keyword passed to {@code SolrDBManager.searchByKeyword}
  * @param fields path segment handed to {@code formSearchField} to build the search fields
  * @param sources path segment selecting document types: containing {@code "video"} adds
  *     {@code "Video"}, containing {@code "news"} adds {@code "NewsArticle"}
  * @return the result of the search, or an empty list if the lookup threw
  */
 @GET
 @Path("/getNews/json/{fields}/{sources}/{keyword}")
 @Produces(MediaType.APPLICATION_JSON)
 public List<Event> getNews(
     @PathParam("keyword") String keyword,
     @PathParam("fields") String fields,
     @PathParam("sources") String sources) {
   SolrDBManager manager = new SolrDBManager();
   ArrayList<String> fieldsToSearch = formSearchField(fields);

   ArrayList<String> documentTypes = new ArrayList<String>();
   if (sources.contains("video")) {
     documentTypes.add("Video");
   }
   if (sources.contains("news")) {
     documentTypes.add("NewsArticle");
   }

   try {
     String configuredLimit = manager.conf.getProperty("visualization_MaxTimelineSize");
     int limit = Integer.parseInt(configuredLimit);
     return manager.searchByKeyword(keyword, documentTypes, fieldsToSearch, limit);
   } catch (Exception e) {
     e.printStackTrace();
     // Fall back to an empty result when configuration or the search fails.
     return new ArrayList<Event>();
   }
 }
  /**
   * Parses the May 2005 month page into a list of events, each stamped with its day.
   *
   * <p>For each day number 1..31 the page is searched for a {@code div} whose id is
   * {@code <day>_May_2005}. Inside a day section every {@code ul} child is a list of items. An
   * item that itself contains a nested {@code ul} is treated as a news story whose nested items
   * are the events; an item without one is a single event with no story.
   *
   * @param doc parsed month page
   * @return all events found, in page order
   */
  protected ArrayList<Event> parseMonthPage(Document doc) {
    ArrayList<Event> events = new ArrayList<Event>();

    String baseQuery = "div#content.mw-body div#bodyContent div#mw-content-text.mw-content-ltr";
    for (int i = 1; i <= 31; i++) {
      // Build the selector from the base for every day. Appending to one growing string would
      // chain the day divs as descendants of each other, which only matches nested day divs.
      String dayQuery = baseQuery + " div#" + i + "_May_2005";
      Elements days = doc.select(dayQuery);
      for (Element eachday : days) { // an id selector, so normally at most one match
        // The id looks like "<day>_May_<year>"; convert it to YYYY-MM-DD so dates can be
        // searched in that form.
        String modifiedDate = eachday.attr("id");
        int firstoccur = modifiedDate.indexOf("_");
        // Skip "_May_" (5 characters) after the day to reach the 4-digit year.
        String year = modifiedDate.substring(firstoccur + 5, firstoccur + 9);
        // First two characters are either "d_" or "dd".
        String day = modifiedDate.substring(0, 2).replace('_', ' ').trim();
        if (day.length() == 1) {
          day = "0" + day;
        }
        String actualDate = year + "-05-" + day;
        try {
          Date.valueOf(actualDate); // validate once before walking the day's content
        } catch (Exception ex) {
          ex.printStackTrace();
          System.err.println("ERROR: date format is wrong!!!! date = " + actualDate);
          continue;
        }

        // The day's children alternate between a date header and lists of events.
        for (Element dateplustext : eachday.children()) {
          if (!dateplustext.tagName().equals("ul")) {
            continue;
          }
          // Each li is either a plain event or a news story wrapping several events.
          for (Element li : dateplustext.children()) {
            boolean hasUL = false;
            for (Element ul : li.children()) {
              if (!ul.tagName().equals("ul")) {
                continue;
              }
              // A nested ul means this li is a news story.
              hasUL = true;
              // The first child node of the li is used as the story (its title/href attributes).
              Node storyNode = li.childNode(0);
              for (Element eventNode : ul.children()) {
                Event event = extractDescriptionAndLinks(eventNode);
                try {
                  event.setDate(Date.valueOf(actualDate));
                } catch (Exception ex) {
                  ex.printStackTrace();
                  System.err.println("ERROR: date format is wrong!!!! date = " + actualDate);
                  continue;
                }
                // Attach the story only when the node carries a title and a valid wiki link.
                if (!storyNode.attr("title").isEmpty() && !storyNode.attr("href").isEmpty()) {
                  if (isValidWikiURL(storyNode.attr("href"))) {
                    Story story = new Story();
                    story.setName(getEntityName(storyNode.attr("href")));
                    story.setWikipediaUrl(getEntityURL(storyNode.attr("href")));
                    event.setStory(story);
                  }
                }
                events.add(event);
              }
            }
            if (!hasUL) { // plain event without a story
              Event event = extractDescriptionAndLinks(li);
              try {
                event.setDate(Date.valueOf(actualDate));
              } catch (Exception ex) {
                ex.printStackTrace();
                System.err.println("ERROR: date format is wrong!!!! date = " + actualDate);
                continue;
              }
              events.add(event);
            }
          }
        }
      }
    }
    return events;
  }