Exemplo n.º 1
0
  /**
   * Searches the index for {@code query} and returns the labels of the clusters built from the
   * hits.
   *
   * <p>Every hit is turned into a Carrot2 document and the documents are clustered twice: with
   * {@code LingoClusteringAlgorithm} and with {@code ByUrlClusteringAlgorithm}. The returned list
   * holds the by-URL labels first, followed by the Lingo labels.
   *
   * <p>This method is best effort: a failure while searching or clustering is logged and the
   * labels gathered so far (possibly none) are returned instead of propagating the exception.
   *
   * @param query raw user query; it is escaped before being parsed
   * @return cluster labels, by-URL clusters first; never {@code null}
   */
  @SuppressWarnings("unchecked")
  @Override
  public List<String> getClusterByCarrot2(String query) {
    List<String> labels = new ArrayList<String>();
    final Controller controller = ControllerFactory.createCachingPooling(IDocumentSource.class);
    final List<org.carrot2.core.Document> documents = Lists.newArrayList();
    try {
      q = getParser().parse(QueryParserUtil.escape(query));
      docs = getIndexSearcher().search(q, Integer.MAX_VALUE);
      hits = docs.scoreDocs;
      for (int i = 0; i < hits.length; i++) {
        Document doc = getIndexSearcher().doc(hits[i].doc);
        // NOTE(review): arguments are passed as (contents, title, user) - confirm this matches
        // the parameter order of the org.carrot2.core.Document constructor.
        documents.add(
            new org.carrot2.core.Document(
                doc.get(CONTENTS_FIELD), doc.get(TITLE_FIELD), doc.get(USER_FIELD)));
      }
      try {
        final ProcessingResult byTopicClusters =
            controller.process(documents, query, LingoClusteringAlgorithm.class);
        final List<Cluster> clustersByTopic = byTopicClusters.getClusters();
        final ProcessingResult byDomainClusters =
            controller.process(documents, query, ByUrlClusteringAlgorithm.class);
        final List<Cluster> clustersByDomain = byDomainClusters.getClusters();
        for (Cluster c : clustersByDomain) {
          labels.add(c.getLabel());
        }
        for (Cluster c : clustersByTopic) {
          labels.add(c.getLabel());
        }
      } finally {
        // A new controller is created on every call, so release it once clustering is done.
        controller.dispose();
      }
    } catch (Exception ex) {
      // Keep the best-effort contract, but leave a trace instead of hiding the failure.
      java.util.logging.Logger.getLogger(getClass().getName())
          .log(java.util.logging.Level.WARNING, "Clustering failed for query: " + query, ex);
    }
    return labels;
  }
Exemplo n.º 2
0
 /**
  * Clusters the documents matching {@code query} using a Lucene-backed document source and
  * returns one entry per cluster.
  *
  * <p>A pooling controller is configured with a {@code LuceneDocumentSource} (registered under
  * the identifier {@code "lucene"}) that reads from {@code directory} and maps
  * {@code TITLE_FIELD} / {@code CONTENTS_FIELD}. The query is then clustered with
  * {@code LingoClusteringAlgorithm}.
  *
  * @param query query passed to the Lucene document source
  * @return one string per cluster in the form {@code "<label> >>>> <document count>"}
  */
 @SuppressWarnings("unchecked")
 @Override
 public List<String> getClusterByCarrotVersion2(String query) {
   List<String> labels = new ArrayList<String>();
   final Controller controller = ControllerFactory.createPooling();

   // Component-specific attributes: index location and the fields to read and search.
   final Map<String, Object> luceneGlobalAttributes = new HashMap<String, Object>();
   LuceneDocumentSourceDescriptor.attributeBuilder(luceneGlobalAttributes).directory(directory);
   SimpleFieldMapperDescriptor.attributeBuilder(luceneGlobalAttributes)
       .titleField(TITLE_FIELD)
       .contentField(CONTENTS_FIELD)
       .searchFields(Arrays.asList(TITLE_FIELD, CONTENTS_FIELD));

   // The global attribute map stays empty; the Lucene source is registered as "lucene".
   controller.init(
       new HashMap<String, Object>(),
       new ProcessingComponentConfiguration(
           LuceneDocumentSource.class, "lucene", luceneGlobalAttributes));
   try {
     final Map<String, Object> processingAttributes = Maps.newHashMap();
     CommonAttributesDescriptor.attributeBuilder(processingAttributes).query(query);
     ProcessingResult process =
         controller.process(
             processingAttributes, "lucene", LingoClusteringAlgorithm.class.getName());
     for (Cluster c : process.getClusters()) {
       labels.add(c.getLabel() + " >>>> " + c.getAllDocuments().size());
     }
   } finally {
     // The controller is created per call, so release it once processing is done.
     controller.dispose();
   }
   return labels;
 }
Exemplo n.º 3
0
  /**
   * Runs two requests on the same caching/pooling controller: a regular query first, then a
   * query that is expected to fail with a {@code ProcessingException} whose cause message
   * contains "Synthetic failure".
   */
  @UsesExternalServices
  @Test
  public void testRequestIndependence() {
    @SuppressWarnings("unchecked")
    final Controller pooled =
        ControllerFactory.createCachingPooling(org.carrot2.core.IDocumentSource.class);
    closeAfterTest(pooled);

    final Map<String, Object> attributes = Maps.newHashMap();

    // First request: a regular query, expected to complete without throwing.
    CommonAttributesDescriptor.attributeBuilder(attributes).results(50).query("data mining");
    pooled.process(
        attributes,
        org.carrot2.webapp.source.WebDocumentSource.class,
        LingoClusteringAlgorithm.class);

    // Second request: reuse the same map, this time with the failure-triggering query.
    attributes.clear();
    CommonAttributesDescriptor.attributeBuilder(attributes)
        .results(50)
        .query(WebDocumentSource.QUERY_FAILURE);

    try {
      pooled.process(
          attributes,
          org.carrot2.webapp.source.WebDocumentSource.class,
          LingoClusteringAlgorithm.class);
      // Reaching this line means the failing query did not throw.
      fail();
    } catch (ProcessingException expected) {
      assertThat(expected.getCause().getMessage()).contains("Synthetic failure");
    }
  }
Exemplo n.º 4
0
  /**
   * Clusters documents fetched from a Lucene index and prints the result to the console.
   *
   * @param args optional; when exactly one argument is given it is used as the index path
   * @throws IOException declared for {@code FSDirectory.open}
   */
  public static void main(String[] args) throws IOException {
    /*
     * We will use the CachingController for this example. Running
     * LuceneDocumentSource within the CachingController will let us open the index
     * once per component initialization and not once per query, which would be the
     * case with SimpleController. We will also use this opportunity to show how
     * component-specific attribute values can be passed during CachingComponent
     * initialization.
     */

    String indexPath = "put your index path here or pass as the first argument";
    if (args.length == 1) {
      indexPath = args[0];
    }

    // Sanity check: fail early with a readable message instead of an error from Lucene.
    final File indexDir = new File(indexPath);
    if (!indexDir.isDirectory()) {
      System.err.println("Index directory does not exist: " + indexPath);
      return;
    }

    /*
     * Create a caching controller that will reuse processing component instances, but
     * will not perform any caching of results produced by components. We will leave
     * caching of documents from Lucene index to Lucene and the operating system
     * caches.
     */
    final Controller controller = ControllerFactory.createPooling();

    /*
     * Prepare a map with component-specific attributes. Here, this map will contain
     * the index location and names of fields to be used to fetch document title and
     * summary.
     */
    final Map<String, Object> luceneGlobalAttributes = new HashMap<String, Object>();

    LuceneDocumentSourceDescriptor.attributeBuilder(luceneGlobalAttributes)
        .directory(FSDirectory.open(indexDir));

    /*
     * Specify fields providing data inside your Lucene index.
     */
    SimpleFieldMapperDescriptor.attributeBuilder(luceneGlobalAttributes)
        .titleField("title")
        .contentField("snippet")
        .searchFields(Arrays.asList("titleField", "fullContent"));

    /*
     * Initialize the controller passing the above attributes as component-specific
     * for Lucene. The global attributes map will be empty. Note that we've provided
     * an identifier for our specially-configured Lucene component, we'll need to use
     * this identifier when performing processing.
     */
    controller.init(
        new HashMap<String, Object>(),
        new ProcessingComponentConfiguration(
            LuceneDocumentSource.class, "lucene", luceneGlobalAttributes));

    try {
      /*
       * Perform processing.
       */
      final String query = "mining";
      final Map<String, Object> processingAttributes = Maps.newHashMap();
      CommonAttributesDescriptor.attributeBuilder(processingAttributes).query(query);

      /*
       * We need to refer to the Lucene component by its identifier we set during
       * initialization. As we've not assigned any identifier to the
       * LingoClusteringAlgorithm we want to use, we can use its fully qualified class
       * name.
       */
      ProcessingResult process =
          controller.process(
              processingAttributes, "lucene", LingoClusteringAlgorithm.class.getName());

      ConsoleFormatter.displayResults(process);
    } finally {
      // Release the controller once processing is done, whether it succeeded or not.
      controller.dispose();
    }
  }
  /**
   * Entry point. Clusters documents fetched from a Lucene index using a custom field mapper and
   * an analyzer factory, then prints the result to the console.
   *
   * @param args optional; when exactly one argument is given it is used as the index path
   * @throws IOException declared for {@code FSDirectory.open}
   */
  public static void main(String[] args) throws IOException {
    /*
     * We will use the CachingController for this example. Running
     * LuceneDocumentSource within the CachingController will let us open the index
     * once per component initialization and not once per query, which would be the
     * case with SimpleController. We will also use this opportunity to show how
     * component-specific attribute values can be passed during CachingComponent
     * initialization.
     */

    String indexPath = "put your index path here or pass as the first argument";
    if (args.length == 1) {
      indexPath = args[0];
    }

    // Sanity check.
    final File indexDir = new File(indexPath);
    if (!indexDir.isDirectory()) {
      System.err.println("Index directory does not exist: " + indexPath);
      return;
    }

    /*
     * Create a caching controller that will reuse processing component instances, but
     * will not perform any caching of results produced by components. We will leave
     * caching of documents from Lucene index to Lucene and the operating system
     * caches.
     */
    final Controller controller = ControllerFactory.createPooling();

    /*
     * Prepare a map with component-specific attributes. Here, this map will contain
     * the index location and names of fields to be used to fetch document title and
     * summary.
     */
    final Map<String, Object> luceneGlobalAttributes = new HashMap<String, Object>();

    LuceneDocumentSourceDescriptor.attributeBuilder(luceneGlobalAttributes)
        .directory(FSDirectory.open(indexDir));

    /*
     * In ClusteringDataFromLucene we used a simple configuration of
     * LuceneDocumentSource whereby we only provided the names of Lucene fields to be
     * used for titles and summaries. If more advanced mapping of Lucene documents is
     * required, you can implement your own version of IFieldMapper as below.
     *
     * Note that we pass an instance of the mapper here; we could also provide its
     * class instead. The differences are summarized below:
     *
     * > Class: Class has to have a no-parameter constructor. Instances of the
     *   class will not be shared between processing threads, which means the
     *   implementation does not have to be thread-safe. Recommended in most
     *   situations unless the instances are expensive to create.
     *
     * > Instance: The provided instance will be shared across processing threads,
     *   which means the implementation MUST be thread-safe.
     */
    LuceneDocumentSourceDescriptor.attributeBuilder(luceneGlobalAttributes)
        .fieldMapper(new CustomFieldMapper());

    /*
     * The Analyzer used by Lucene while searching can also be provided via factory
     * because it does not have a parameterless constructor.
     */
    LuceneDocumentSourceDescriptor.attributeBuilder(luceneGlobalAttributes)
        .analyzer(StandardAnalyzerFactory.class);

    /*
     * Initialize the controller passing the above attributes as component-specific
     * for Lucene. The global attributes map will be empty. Note that we've provided
     * an identifier for our specially-configured Lucene component, we'll need to use
     * this identifier when performing processing.
     */
    controller.init(
        new HashMap<String, Object>(),
        new ProcessingComponentConfiguration(
            LuceneDocumentSource.class, "lucene", luceneGlobalAttributes));

    try {
      /*
       * Perform processing.
       */
      final String query = "mining";
      final Map<String, Object> processingAttributes = Maps.newHashMap();
      CommonAttributesDescriptor.attributeBuilder(processingAttributes).query(query);

      /*
       * We need to refer to the Lucene component by its identifier we set during
       * initialization. As we've not assigned any identifier to the
       * LingoClusteringAlgorithm we want to use, we can use its fully qualified class
       * name.
       */
      ProcessingResult process =
          controller.process(
              processingAttributes, "lucene", LingoClusteringAlgorithm.class.getName());

      ConsoleFormatter.displayResults(process);
    } finally {
      // Release the controller once processing is done, whether it succeeded or not.
      controller.dispose();
    }
  }
Exemplo n.º 6
0
  /**
   * Starts an embedded Jetty server with a login service configured, then fetches
   * {@code /basic/kaczynski.xml} through {@code XmlDocumentSource} using credentials supplied
   * via the {@code HttpAuthHub} system properties, and expects 50 documents.
   *
   * <p>The credential system properties are JVM-global, so their previous values are restored
   * when the test finishes to avoid leaking them into other tests.
   *
   * @throws Throwable on any server, I/O or assertion failure
   */
  @Test
  public void checkBasicAuthAccess() throws Throwable {
    final Server server = new Server();
    final SelectChannelConnector connector = new SelectChannelConnector();
    connector.setPort(/* any */ 0);
    connector.setReuseAddress(false);
    connector.setSoLingerTime(0);
    server.addConnector(connector);

    HashLoginService loginService = new HashLoginService();
    loginService.putUser("username", new Password("userpass"), new String[] {"role1", "role2"});

    // Released once the connector reports that it has started or failed.
    final CountDownLatch latch = new CountDownLatch(1);

    WebAppContext wac = new WebAppContext();
    wac.getSecurityHandler().setLoginService(loginService);
    wac.setContextPath("/");

    connector.addLifeCycleListener(
        new ListenerAdapter() {
          public void lifeCycleStarted(LifeCycle lc) {
            System.out.println("Started on port: " + connector.getLocalPort());
            latch.countDown();
          }

          public void lifeCycleFailure(LifeCycle lc, Throwable t) {
            System.out.println("Failure: " + t);
            latch.countDown();
          }
        });
    wac.setParentLoaderPriority(true);

    // The web application root is the directory two levels above the test resource.
    URL resource = getClass().getResource("/auth/basic/kaczynski.xml");
    assertThat(resource.toURI().getScheme()).isEqualTo("file");
    File webapp = new File(resource.toURI());
    webapp = webapp.getParentFile(); // /auth/basic
    webapp = webapp.getParentFile(); // /auth
    wac.setWar(webapp.getAbsolutePath());
    wac.setClassLoader(Thread.currentThread().getContextClassLoader());

    server.setHandler(wac);
    server.setStopAtShutdown(true);
    try {
      server.start();
      latch.await();

      // Remember the current values so the global state can be restored afterwards.
      final String previousUser = System.getProperty(HttpAuthHub.USERNAME_PROPERTY);
      final String previousPassword = System.getProperty(HttpAuthHub.PASSWORD_PROPERTY);
      try {
        System.setProperty(HttpAuthHub.USERNAME_PROPERTY, "username");
        System.setProperty(HttpAuthHub.PASSWORD_PROPERTY, "userpass");
        Controller c = ControllerFactory.createSimple();
        try {
          Map<String, Object> attrs = new HashMap<String, Object>();
          XmlDocumentSourceDescriptor.attributeBuilder(attrs)
              .xml(
                  new URLResourceWithParams(
                      new URL(
                          "http://localhost:"
                              + connector.getLocalPort()
                              + "/basic/kaczynski.xml")));
          ProcessingResult r = c.process(attrs, XmlDocumentSource.class);

          assertThat(r.getDocuments()).hasSize(50);
        } finally {
          c.dispose();
        }
      } finally {
        restoreSystemProperty(HttpAuthHub.USERNAME_PROPERTY, previousUser);
        restoreSystemProperty(HttpAuthHub.PASSWORD_PROPERTY, previousPassword);
      }
    } finally {
      server.stop();
    }
  }

  /** Sets system property {@code key} back to {@code previous}, clearing it when it was unset. */
  private static void restoreSystemProperty(String key, String previous) {
    if (previous == null) {
      System.clearProperty(key);
    } else {
      System.setProperty(key, previous);
    }
  }
  /**
   * Starts the component: loads the plugin configuration, deserializes the algorithm suite and
   * creates and initializes the pooling Carrot2 {@code controller}.
   *
   * <p>Steps, in order: locate the plugin configuration directory, load the settings files found
   * there, resolve and parse the algorithm suite, record the ids of the available algorithms in
   * {@code algorithms}, set up the lexical resource lookup, register the Lingo3G license when
   * one is found, and finally create and initialize the controller.
   *
   * @throws ElasticsearchException if any step fails (every exception raised inside the main
   *     {@code try} block, including the {@code ElasticsearchException}s thrown there, is
   *     wrapped in a "Could not start Carrot2 controller." exception), or if no algorithm ended
   *     up registered
   */
  @SuppressForbidden(reason = "C2 integration (File API)")
  @Override
  protected void doStart() throws ElasticsearchException {
    try {
      Settings.Builder builder = Settings.builder();
      Path pluginConfigPath = environment.configFile().resolve(ClusteringPlugin.PLUGIN_NAME);

      // Locate the configuration directory, falling back to the source tree layout.
      if (!Files.isDirectory(pluginConfigPath)) {
        Path srcConfig = Paths.get("src/main/config");
        if (Files.isDirectory(srcConfig)) {
          // Allow running from within the IDE.
          pluginConfigPath = srcConfig;
        } else {
          throw new ElasticsearchException("Config folder missing: " + pluginConfigPath);
        }
      } else {
        logger.info("Configuration files at: {}", pluginConfigPath.toAbsolutePath());
      }

      // Load every supported configuration file that exists into the same settings builder.
      for (String configName :
          new String[] {"config.yml", "config.yaml", "config.json", "config.properties"}) {
        try {
          Path resolved = pluginConfigPath.resolve(configName);
          if (resolved != null && Files.exists(resolved)) {
            builder.loadFromPath(resolved);
          }
        } catch (NoClassDefFoundError e) {
          // Logged and skipped so that the remaining configuration files are still tried.
          logger.warn("Could not parse: {}", e, configName);
        }
      }
      Settings c2Settings = builder.build();

      // Parse suite descriptors with loggers turned off (shut them up a bit).
      // NOTE(review): the suite setting is read without a default here; if the key can be absent
      // (presumably get() returns null then), resolve() would fail with a less helpful error -
      // confirm against the Settings API and the shipped configuration.
      final Path suitePath = pluginConfigPath.resolve(c2Settings.get(DEFAULT_SUITE_PROPERTY_NAME));
      if (!Files.isRegularFile(suitePath)) {
        throw new ElasticsearchException(
            "Could not find algorithm suite: " + suitePath.toAbsolutePath().normalize());
      }

      // The suite is looked up by file name inside its own parent directory.
      final ResourceLookup suiteLookup =
          new ResourceLookup(new DirLocator(suitePath.getParent().toFile()));
      final IResource suiteResource = suiteLookup.getFirst(suitePath.getFileName().toString());

      // Ids of components removed from the suite because they were unavailable.
      final List<String> failed = Lists.newArrayList();
      final ProcessingComponentSuite suite =
          LoggerUtils.quietCall(
              new Callable<ProcessingComponentSuite>() {
                public ProcessingComponentSuite call() throws Exception {
                  ProcessingComponentSuite suite =
                      ProcessingComponentSuite.deserialize(suiteResource, suiteLookup);
                  for (ProcessingComponentDescriptor desc : suite.removeUnavailableComponents()) {
                    failed.add(desc.getId());
                    // Missing classes are only reported at debug level; other initialization
                    // failures are reported as warnings together with their cause.
                    if (isNoClassDefFound(desc.getInitializationFailure())) {
                      logger.debug("Algorithm not available on classpath: {}", desc.getId());
                    } else {
                      logger.warn(
                          "Algorithm initialization failed: {}",
                          desc.getInitializationFailure(),
                          desc.getId());
                    }
                  }
                  return suite;
                }
              },
              Logger.getLogger(ProcessingComponentDescriptor.class),
              Logger.getLogger(ReflectionUtils.class));

      // Collect the ids of the algorithms left in the suite and publish them read-only.
      algorithms = Lists.newArrayList();
      for (ProcessingComponentDescriptor descriptor : suite.getAlgorithms()) {
        algorithms.add(descriptor.getId());
      }
      algorithms = Collections.unmodifiableList(algorithms);

      if (!algorithms.isEmpty()) {
        logger.info("Available clustering components: {}", Joiner.on(", ").join(algorithms));
      }
      if (!failed.isEmpty()) {
        logger.info("Unavailable clustering components: {}", Joiner.on(", ").join(failed));
      }

      // Lexical resources default to the configuration directory itself (".").
      final Path resourcesPath =
          pluginConfigPath
              .resolve(c2Settings.get(DEFAULT_RESOURCES_PROPERTY_NAME, "."))
              .toAbsolutePath()
              .normalize();

      logger.info("Lexical resources dir: {}", resourcesPath);

      // Lookup built from the configured directory plus this class's class loader.
      final ResourceLookup resourceLookup =
          new ResourceLookup(
              new DirLocator(resourcesPath.toFile()),
              new ClassLoaderLocator(ControllerSingleton.class.getClassLoader()));

      // Change the default resource lookup to include the configured location.
      Map<String, Object> c2SettingsAsMap = Maps.newHashMap();
      DefaultLexicalDataFactoryDescriptor.attributeBuilder(c2SettingsAsMap)
          .resourceLookup(resourceLookup);
      // Settings loaded from the configuration files are added after the lookup attribute.
      c2SettingsAsMap.putAll(c2Settings.getAsMap());

      // Set up the license for Lingo3G, if it's available.
      Path lingo3gLicense = scanForLingo3GLicense(environment, pluginConfigPath);
      if (lingo3gLicense != null && Files.isReadable(lingo3gLicense)) {
        c2SettingsAsMap.put("license", new FileResource(lingo3gLicense.toFile()));
      } else if (algorithms.contains("lingo3g")) {
        logger.warn(
            "Lingo3G is on classpath, but no licenses have been found. Check out the documentation.");
      }

      // Create component pool.
      // A configured size greater than zero is passed to the factory; otherwise the no-argument
      // factory method is used.
      Integer poolSize = c2Settings.getAsInt(DEFAULT_COMPONENT_SIZE_PROPERTY_NAME, 0);
      if (poolSize > 0) {
        controller = ControllerFactory.createPooling(poolSize);
      } else {
        controller = ControllerFactory.createPooling();
      }
      controller.init(c2SettingsAsMap, suite.getComponentConfigurations());
    } catch (Exception e) {
      throw new ElasticsearchException("Could not start Carrot2 controller.", e);
    }

    // Starting with no usable algorithm is treated as an error.
    if (algorithms == null || algorithms.isEmpty()) {
      throw new ElasticsearchException(
          "No registered/ available clustering algorithms? Check the logs, it's odd.");
    }
  }