예제 #1
0
  /**
   * Returns the character position (0-based) of the first occurrence of {@code v} in this string,
   * searching forward from the character position {@code start}.
   *
   * <p>An empty {@code v} always yields 0, regardless of {@code start}. A negative {@code start}
   * is treated like 0, and a {@code start} beyond the end of this string yields -1.
   *
   * @param v the substring to search for
   * @param start the 0-based character position in this string at which the search begins
   * @return the 0-based character position of the first match at or after {@code start}, or -1 if
   *     {@code v} does not occur there
   */
  public int indexOf(UTF8String v, int start) {
    if (v.numBytes() == 0) {
      return 0;
    }

    // Translate the character position `start` into a byte position by stepping over one
    // character at a time (the step width comes from each character's first byte).
    int i = 0; // position in bytes
    int c = 0; // position in characters
    while (i < numBytes && c < start) {
      i += numBytesForFirstByte(getByte(i));
      c += 1;
    }

    // Largest byte position at which `v` still fits inside this string; negative when `v` is
    // longer than this string. Expressed as a subtraction because the equivalent check
    // `i + v.numBytes > numBytes` can overflow int for very large operands, which would let the
    // byte comparison below run past the end of this string.
    final int lastCandidate = numBytes - v.numBytes;
    while (i <= lastCandidate) {
      if (ByteArrayMethods.arrayEquals(base, offset + i, v.base, v.offset, v.numBytes)) {
        return c;
      }
      // `v` is non-empty here, so i <= lastCandidate implies i < numBytes and getByte(i) is valid.
      i += numBytesForFirstByte(getByte(i));
      c += 1;
    }

    return -1;
  }
예제 #2
0
 /**
  * Finds {@code str} in this string, scanning from right to left.
  *
  * <p>Positions are byte offsets. The scan begins at {@code start}, clamped to the last offset at
  * which {@code str} still fits inside this string, and moves one byte to the left after every
  * failed comparison.
  *
  * @param str the byte sequence to look for; must not be empty (checked by assertion)
  * @param start the right-most byte offset to try first
  * @return the largest byte offset not greater than {@code start} at which {@code str} occurs, or
  *     -1 if there is no such offset
  */
 private int rfind(UTF8String str, int start) {
   assert (str.numBytes > 0);
   // Clamp the first candidate so the comparison never reads past the end of this string.
   // When `str` is longer than this string the clamped value is negative and the loop is skipped.
   int pos = Math.min(start, numBytes - str.numBytes);
   while (pos >= 0) {
     if (ByteArrayMethods.arrayEquals(base, offset + pos, str.base, str.offset, str.numBytes)) {
       return pos;
     }
     pos -= 1;
   }
   return -1;
 }
예제 #3
0
 /**
  * Compares this row with {@code other} for equality.
  *
  * <p>The result is {@code true} only when {@code other} is an {@code UnsafeRow} with the same
  * {@code sizeInBytes} and {@code ByteArrayMethods.arrayEquals} reports the two backing regions
  * equal over that many bytes.
  *
  * @param other the object to compare with
  * @return {@code true} if both rows have identical size and contents, {@code false} otherwise
  */
 @Override
 public boolean equals(Object other) {
   if (!(other instanceof UnsafeRow)) {
     return false;
   }
   final UnsafeRow that = (UnsafeRow) other;
   // Different sizes can never be equal; skip the byte comparison entirely.
   if (sizeInBytes != that.sizeInBytes) {
     return false;
   }
   return ByteArrayMethods.arrayEquals(
       baseObject, baseOffset, that.baseObject, that.baseOffset, sizeInBytes);
 }
예제 #4
0
 /**
  * Compares this string with {@code other} for equality.
  *
  * <p>The result is {@code true} only when {@code other} is a {@code UTF8String} with the same
  * {@code numBytes} and {@code ByteArrayMethods.arrayEquals} reports the two backing regions
  * equal over that many bytes.
  *
  * @param other the object to compare with
  * @return {@code true} if both strings have identical length and bytes, {@code false} otherwise
  */
 @Override
 public boolean equals(final Object other) {
   if (!(other instanceof UTF8String)) {
     return false;
   }
   final UTF8String that = (UTF8String) other;
   // Cheap length check first; only equal-length strings need the byte comparison.
   return numBytes == that.numBytes
       && ByteArrayMethods.arrayEquals(base, offset, that.base, that.offset, numBytes);
 }
예제 #5
0
  /**
   * Treats this string as a comma separated list and returns the 1-based index of the first
   * element whose bytes equal {@code match}.
   *
   * @param match the element to look for
   * @return the 1-based index of the first matching element, or 0 if {@code match} contains a
   *     comma or is not an element of this list
   */
  public int findInSet(UTF8String match) {
    // An element can never contain the separator, so such a `match` cannot be found.
    if (match.contains(COMMA_UTF8)) {
      return 0;
    }

    int elementStart = 0; // byte offset of the first byte of the current element
    int elementIndex = 1; // 1-based index of the current element
    int cursor = 0;
    while (true) {
      // Advance to the separator that ends the current element, or to the end of the string.
      while (cursor < numBytes && getByte(cursor) != (byte) ',') {
        cursor++;
      }

      final int elementLength = cursor - elementStart;
      if (elementLength == match.numBytes
          && ByteArrayMethods.arrayEquals(
              base, offset + elementStart, match.base, match.offset, match.numBytes)) {
        return elementIndex;
      }

      if (cursor >= numBytes) {
        // That was the last element and it did not match.
        return 0;
      }

      // Step over the comma and continue with the next element.
      cursor++;
      elementStart = cursor;
      elementIndex++;
    }
  }
예제 #6
0
  /**
   * Computes the Levenshtein distance between this string and {@code other}: the minimum number
   * of single-character insertions, deletions or substitutions needed to turn one string into the
   * other. Characters are compared as whole multi-byte units, not as individual bytes.
   *
   * @param other the string to compare with
   * @return the edit distance, measured in characters
   */
  public int levenshteinDistance(UTF8String other) {
    // Implementation adopted from org.apache.common.lang3.StringUtils.getLevenshteinDistance

    final int thisChars = numChars();
    final int otherChars = other.numChars();

    // Against an empty string the distance is simply the length of the other one.
    if (thisChars == 0) {
      return otherChars;
    }
    if (otherChars == 0) {
      return thisChars;
    }

    // Let the shorter string span the columns so the two working rows stay as small as possible.
    final UTF8String shorter;
    final UTF8String longer;
    final int shortLen;
    final int longLen;
    if (thisChars <= otherChars) {
      shorter = this;
      longer = other;
      shortLen = thisChars;
      longLen = otherChars;
    } else {
      shorter = other;
      longer = this;
      shortLen = otherChars;
      longLen = thisChars;
    }

    int[] previousRow = new int[shortLen + 1];
    int[] currentRow = new int[shortLen + 1];
    for (int col = 0; col <= shortLen; col++) {
      previousRow[col] = col;
    }

    int longPos = 0; // byte position of the current character in `longer`
    for (int row = 1; row <= longLen; row++) {
      final int longWidth = numBytesForFirstByte(longer.getByte(longPos));
      currentRow[0] = row;

      int shortPos = 0; // byte position of the current character in `shorter`
      for (int col = 1; col <= shortLen; col++) {
        final int shortWidth = numBytesForFirstByte(shorter.getByte(shortPos));

        // Two characters are equal when first byte, byte width and all remaining bytes agree;
        // the cheap checks come first so the full comparison only runs for likely matches.
        final boolean sameChar =
            shorter.getByte(shortPos) == longer.getByte(longPos)
                && longWidth == shortWidth
                && ByteArrayMethods.arrayEquals(
                    longer.base,
                    longer.offset + longPos,
                    shorter.base,
                    shorter.offset + shortPos,
                    longWidth);
        final int substitution = previousRow[col - 1] + (sameChar ? 0 : 1);
        final int cheapestGap = Math.min(currentRow[col - 1] + 1, previousRow[col] + 1);
        currentRow[col] = Math.min(cheapestGap, substitution);

        shortPos += shortWidth;
      }

      // The row just filled in becomes the reference row for the next character of `longer`.
      final int[] recycled = previousRow;
      previousRow = currentRow;
      currentRow = recycled;

      longPos += longWidth;
    }

    return previousRow[shortLen];
  }
예제 #7
0
 /**
  * Tests whether the bytes of {@code s} occur in this string starting at byte offset {@code pos}.
  *
  * @param s the byte sequence to compare against
  * @param pos the byte offset in this string at which the comparison starts
  * @return {@code true} if {@code s} fits entirely inside this string at {@code pos} and the
  *     bytes are equal; {@code false} otherwise, including when {@code pos} is negative or
  *     {@code s} would extend past the end of this string
  */
 private boolean matchAt(final UTF8String s, int pos) {
   // Compare `pos` against `numBytes - s.numBytes` rather than computing `s.numBytes + pos`:
   // the sum wraps around to a negative int for very large `pos`, which would slip past the
   // bounds check. The subtraction cannot overflow as long as both lengths are non-negative.
   if (pos < 0 || pos > numBytes - s.numBytes) {
     return false;
   }
   return ByteArrayMethods.arrayEquals(base, offset + pos, s.base, s.offset, s.numBytes);
 }