예제 #1
0
  /**
   * Tells whether a category is part of the domain described by a vocabulary.
   *
   * <p>The plain title of the category is passed through {@code vocabulary.preprocess(...)} and
   * the resulting terms are looked up one by one in the vocabulary. The search stops at the first
   * term the vocabulary contains.
   *
   * @param category The category to check. It must not be {@code null}.
   * @param vocabulary The vocabulary of the desired domain.
   * @return {@code true} if at least one term of the category title is contained in the
   *     vocabulary. Otherwise, {@code false}.
   * @throws WikiTitleParsingException propagated from reading the title of the category.
   */
  public boolean isDomain(Category category, DomainVocabulary vocabulary)
      throws WikiTitleParsingException {
    CHK.CHECK_NOT_NULL(category);
    String plainTitle = category.getTitle().getPlainTitle();
    Iterator<String> terms = vocabulary.preprocess(plainTitle).iterator();
    while (terms.hasNext()) {
      // A single matching term is enough; no need to look at the remaining ones.
      if (vocabulary.contains(terms.next())) {
        return true;
      }
    }
    return false;
  }
예제 #2
0
  /**
   * Command-line entry point of the domain extraction process.
   *
   * <p>The process has three steps. The command-line options decide the step it starts from, and
   * every later step runs afterwards as well (the {@code switch} below falls through on purpose):
   *
   * <ol>
   *   <li>Create the vocabulary of the category whose id is given by option {@code c} and write
   *       it to a file named with the pattern {@code %s_%s_%s.vocabulary} (category, language,
   *       year) in the {@code user.dir} directory. This is the default starting step.
   *   <li>Explore the Wikipedia categories with the most frequent terms of the vocabulary, using
   *       option {@code t}. The process starts here when options {@code c}, {@code t}, {@code s}
   *       and {@code v} are all present; the vocabulary is then read from the file given by
   *       {@code v}.
   *   <li>Print the page id and the title of the categories which belong to the domain, using
   *       option {@code s} as score. The process starts here when options {@code k} and {@code s}
   *       are present.
   * </ol>
   *
   * <p>Options {@code l} (language) and {@code y} (year) are always read.
   *
   * @param args The command-line arguments.
   * @throws IOException propagated from the operations invoked by this method.
   */
  public static void main(String[] args) throws IOException {
    final String vocabularyFileFormat = "%s_%s_%s.vocabulary";
    CommandLine cLine = parseArguments(args);
    Locale language = new Locale(cLine.getOptionValue("l"));
    int year = Integer.parseInt(cLine.getOptionValue("y"));
    CategoryExplorer explorer = new CategoryExplorer(language, year);
    Category cat = null;
    DomainVocabulary vocabulary = new DomainVocabulary(explorer.getLocale());
    ArrayList<GroupOfCategories> groupsList = null;
    int step = 1;
    if (cLine.hasOption("c")
        && cLine.hasOption("t")
        && cLine.hasOption("s")
        && cLine.hasOption("v")) {
      // A vocabulary file is available, so step 1 can be skipped.
      step = 2;
      String category = cLine.getOptionValue("c");
      cat = explorer.loadCategory(Integer.parseInt(category));
      File input = new File(cLine.getOptionValue("v"));
      vocabulary.insertFromFile(input);
    }
    if (cLine.hasOption("k") && cLine.hasOption("s")) {
      step = 3;
      // TODO init groupsList with k file
      // NOTE(review): until this is implemented, groupsList is still null when step 3 runs.
    }

    switch (step) {
      case 1:
        // Step 1: Create domain vocabulary
        logger.info("Step 1.");
        String category = cLine.getOptionValue("c");
        cat = explorer.loadCategory(Integer.parseInt(category));
        if (cat == null) {
          // Without a category there is nothing to build the vocabulary from.
          logger.error("The category " + category + " could not be loaded.");
          return;
        }
        try {
          vocabulary = explorer.createCategoryVocabulary(cat);
        } catch (WikiApiException e) {
          // The process goes on with the vocabulary created before the failure.
          logger.errorEnd("Exception during Wikipedia DB access:" + e.getLocalizedMessage());
          e.printStackTrace();
        }
        String filename = String.format(vocabularyFileFormat, category, language, year);
        File vocabularyFile = new File(System.getProperty("user.dir"), filename);
        vocabulary.toFile(vocabularyFile);
        // fall through
      case 2:
        // Step 2: Explore Wikipedia categories with the most frequent terms
        // of the vocabulary
        logger.info("Step 2.");
        if (cat == null) {
          // Reached when the process starts at step 2 and the category was not found.
          logger.error("The category " + cLine.getOptionValue("c") + " could not be loaded.");
          return;
        }
        String top = cLine.getOptionValue("t");
        vocabulary = vocabulary.getTop(Float.parseFloat(top));
        groupsList = explorer.scoreDomain(vocabulary, cat);
        // TODO Save groups
        // fall through
      case 3:
        // Extract the categories which belong to the domain
        logger.info("Step 3.");
        String score = cLine.getOptionValue("s");
        HashSet<Category> domain =
            (HashSet<Category>) explorer.getDomainCategories(groupsList, Double.parseDouble(score));
        for (Category domainCat : domain) {
          try {
            System.out.println(
                domainCat.getPageId() + "\t" + domainCat.getTitle().getWikiStyleTitle());
          } catch (WikiTitleParsingException e) {
            logger.error("The category has no title assigned" + e.getLocalizedMessage());
            e.printStackTrace();
          }
        }
        break;
      default:
        // step is only ever set to 1, 2 or 3 above.
        break;
    }
    logger.info("End of the process.");
  }