@Test
  public void testWhiteSpacesInSingleSegment() throws URISyntaxException {
    // Scenario: a single segment whose text has leading and trailing spaces is run
    // through the segmentation step with the DEEPEN_EXISTING strategy. The assertions
    // below expect the spaces to end up in non-segment parts around one "Text" segment.
    ITextUnit textUnit = new TextUnit("tu1");
    TextContainer container = textUnit.getSource();
    TextFragment paddedText = new TextFragment("  Text ");
    container.append(new Segment("seg1", paddedText));

    // Initial state: exactly one part, and that part is the segment.
    assertEquals(1, container.getSegments().count());
    assertEquals(1, container.count());

    assertEquals("  Text ", container.get(0).toString());
    assertTrue(container.get(0).isSegment());

    // Run the step on the text unit.
    params.setSegmentationStrategy(SegmStrategy.DEEPEN_EXISTING);
    Event textUnitEvent = new Event(EventType.TEXT_UNIT, textUnit);
    segStep.handleTextUnit(textUnitEvent);

    // After the step: still one segment, but three parts in total.
    assertEquals(1, container.getSegments().count());
    assertEquals(3, container.count());

    // Part 0: leading white space, not a segment.
    assertEquals("  ", container.get(0).toString());
    assertFalse(container.get(0).isSegment());

    // Part 1: the text itself, the only segment.
    assertEquals("Text", container.get(1).toString());
    assertTrue(container.get(1).isSegment());

    // Part 2: trailing white space, not a segment.
    assertEquals(" ", container.get(2).toString());
    assertFalse(container.get(2).isSegment());
  }
Exemple #2
0
 /**
  * Installs a single rule built from {@code bbr} and {@code abr} as the "default"
  * language rule, recompiles the rules for English, and asserts that {@code text}
  * comes back as exactly one segment whose content equals the whole input.
  *
  * @param text the text to segment
  * @param bbr first argument passed to the {@code Rule} constructor
  *     (presumably the before-break pattern; verify against {@code Rule})
  * @param abr second argument passed to the {@code Rule} constructor
  *     (presumably the after-break pattern; verify against {@code Rule})
  * @param beforeBreak not used by this method
  * @param afterBreak not used by this method
  */
 private void testNoBreak(
     String text, String bbr, String abr, String beforeBreak, String afterBreak) {
   // Start from an empty rule list holding only the rule under test.
   rules.clear();
   Rule ruleUnderTest = new Rule(bbr, abr, true);
   rules.add(ruleUnderTest);

   // Register the list under "default" and map the ".*" pattern to it.
   doc.addLanguageRule("default", rules);
   LanguageMap catchAll = new LanguageMap(".*", "default");
   doc.addLanguageMap(catchAll);

   // Force rules recompile
   segmenter.setLanguage(null);
   doc.compileLanguageRules(LocaleId.ENGLISH, segmenter);

   // The segmenter must report a single segment for the text.
   assertEquals(1, segmenter.computeSegments(text));

   // Applying the computed ranges must yield one segment covering all of the text.
   TextContainer container = new TextContainer(text);
   container.getSegments().create(segmenter.getRanges());
   assertEquals(text, container.getSegments().get(0).toString());
 }
Exemple #3
0
  /**
   * Computes a count for {@code text} by dispatching on its runtime type and
   * delegating down to the string-level counter.
   *
   * <ul>
   *   <li>{@code ITextUnit}: only the source container is counted; targets are not
   *       consulted.
   *   <li>{@code Segment}: the segment's content is counted.
   *   <li>{@code TextContainer}: the counts of its segments are summed (segments, not
   *       all parts).
   *   <li>{@code TextFragment}: the text obtained from {@code TextUnitUtil.getText} is
   *       counted.
   *   <li>{@code String}: {@code instantiateCounter(classRef)} is called and the count
   *       is taken from {@code counter.doCount}.
   * </ul>
   *
   * @param classRef counter class handed to {@code instantiateCounter}
   * @param text the object to count; one of the types listed above
   * @param language the locale passed through to the counter
   * @return the computed count, or 0 when {@code text} is null, when
   *     {@code Util.isNullOrEmpty(language)} holds, when {@code text} is of an
   *     unsupported type, or when no counter is available after instantiation
   */
  protected static long count(
      Class<? extends BaseCounter> classRef, Object text, LocaleId language) {

    // Nothing to count without both a text and a usable language.
    if (text == null || Util.isNullOrEmpty(language)) {
      return 0L;
    }

    if (text instanceof ITextUnit) {
      // Only the source of a text unit is counted.
      TextContainer sourceContainer = ((ITextUnit) text).getSource();
      return count(classRef, sourceContainer, language);
    }

    if (text instanceof Segment) {
      return count(classRef, ((Segment) text).getContent(), language);
    }

    if (text instanceof TextContainer) {
      // Works on the segments' content (as opposed to the parts' content).
      long total = 0;
      for (Segment segment : ((TextContainer) text).getSegments()) {
        total += count(classRef, segment, language);
      }
      return total;
    }

    if (text instanceof TextFragment) {
      String plainText = TextUnitUtil.getText((TextFragment) text);
      return count(classRef, plainText, language);
    }

    if (text instanceof String) {
      instantiateCounter(classRef);
      return (counter == null) ? 0L : counter.doCount((String) text, language);
    }

    // Unsupported type.
    return 0;
  }
  /**
   * Runs the segmentation step three times on the same text unit, once per strategy
   * (KEEP_EXISTING, DEEPEN_EXISTING, OVERWRITE_EXISTING), and checks the resulting
   * parts of the source container after each run.
   */
  @Test
  public void testSegmentationStrategy() {
    ITextUnit textUnit = new TextUnit("tu1");
    TextContainer container = textUnit.getSource();

    // Initial content: segment, text part, segment.
    // NOTE(review): both segments are created with the id "seg1" - confirm this is intended.
    TextFragment firstContent = new TextFragment("Sentence1. Sentence2.");
    container.append(new Segment("seg1", firstContent));
    container.append(new TextPart(" Text part 1. "));
    TextFragment lastContent = new TextFragment("Sentence3.");
    container.append(new Segment("seg1", lastContent));

    assertEquals(2, container.getSegments().count());

    assertEquals("Sentence1. Sentence2.", container.get(0).toString());
    assertTrue(container.get(0).isSegment());

    assertEquals(" Text part 1. ", container.get(1).toString());
    assertFalse(container.get(1).isSegment());

    assertEquals("Sentence3.", container.get(2).toString());
    assertTrue(container.get(2).isSegment());

    // Pass 1: KEEP_EXISTING - the container is expected to be unchanged.
    params.setSegmentationStrategy(SegmStrategy.KEEP_EXISTING);
    Event keepEvent = new Event(EventType.TEXT_UNIT, textUnit);
    segStep.handleTextUnit(keepEvent);

    assertEquals(2, container.getSegments().count());
    assertEquals(3, container.count());

    assertEquals("Sentence1. Sentence2.", container.get(0).toString());
    assertTrue(container.get(0).isSegment());

    assertEquals(" Text part 1. ", container.get(1).toString());
    assertFalse(container.get(1).isSegment());

    assertEquals("Sentence3.", container.get(2).toString());
    assertTrue(container.get(2).isSegment());

    // Pass 2: DEEPEN_EXISTING - the first segment is expected to be split in two,
    // while the text part stays a non-segment part.
    params.setSegmentationStrategy(SegmStrategy.DEEPEN_EXISTING);
    Event deepenEvent = new Event(EventType.TEXT_UNIT, textUnit);
    segStep.handleTextUnit(deepenEvent);

    assertEquals(3, container.getSegments().count());
    assertEquals(5, container.count());

    assertEquals("Sentence1.", container.get(0).toString());
    assertTrue(container.get(0).isSegment());

    assertEquals(" ", container.get(1).toString());
    assertFalse(container.get(1).isSegment());

    assertEquals("Sentence2.", container.get(2).toString());
    assertTrue(container.get(2).isSegment());

    assertEquals(" Text part 1. ", container.get(3).toString());
    assertFalse(container.get(3).isSegment());

    assertEquals("Sentence3.", container.get(4).toString());
    assertTrue(container.get(4).isSegment());

    // Pass 3: OVERWRITE_EXISTING - the former text part is expected to become a
    // segment as well, with single spaces left as non-segment parts in between.
    params.setSegmentationStrategy(SegmStrategy.OVERWRITE_EXISTING);
    Event overwriteEvent = new Event(EventType.TEXT_UNIT, textUnit);
    segStep.handleTextUnit(overwriteEvent);

    assertEquals(4, container.getSegments().count());
    assertEquals(7, container.count());

    assertEquals("Sentence1.", container.get(0).toString());
    assertTrue(container.get(0).isSegment());

    assertEquals(" ", container.get(1).toString());
    assertFalse(container.get(1).isSegment());

    assertEquals("Sentence2.", container.get(2).toString());
    assertTrue(container.get(2).isSegment());

    assertEquals(" ", container.get(3).toString());
    assertFalse(container.get(3).isSegment());

    assertEquals("Text part 1.", container.get(4).toString());
    assertTrue(container.get(4).isSegment());

    assertEquals(" ", container.get(5).toString());
    assertFalse(container.get(5).isSegment());

    assertEquals("Sentence3.", container.get(6).toString());
    assertTrue(container.get(6).isSegment());
  }