/** * Returns the position of the first occurrence of substr in current string from the specified * position (0-based index). * * @param v the string to be searched * @param start the start position of the current string for searching * @return the position of the first occurrence of substr, if not found, -1 returned. */ public int indexOf(UTF8String v, int start) { if (v.numBytes() == 0) { return 0; } // locate to the start position. int i = 0; // position in byte int c = 0; // position in character while (i < numBytes && c < start) { i += numBytesForFirstByte(getByte(i)); c += 1; } do { if (i + v.numBytes > numBytes) { return -1; } if (ByteArrayMethods.arrayEquals(base, offset + i, v.base, v.offset, v.numBytes)) { return c; } i += numBytesForFirstByte(getByte(i)); c += 1; } while (i < numBytes); return -1; }
/**
 * Searches for {@code str} from right to left: byte offset {@code start} is tried first, then
 * every smaller offset down to 0.
 *
 * NOTE(review): there is no upper-bound check on {@code start} here; callers presumably ensure
 * that {@code start + str.numBytes} stays within this string -- confirm.
 *
 * @return the byte offset of the first match encountered, or -1 if no offset matches
 */
private int rfind(UTF8String str, int start) {
  assert (str.numBytes > 0);
  for (int pos = start; pos >= 0; pos--) {
    if (ByteArrayMethods.arrayEquals(base, offset + pos, str.base, str.offset, str.numBytes)) {
      return pos;
    }
  }
  return -1;
}
/**
 * Compares this row with another object. The result is true only when {@code other} is an
 * {@code UnsafeRow} with the same {@code sizeInBytes} whose backing region is reported equal
 * by {@code ByteArrayMethods.arrayEquals}.
 */
@Override
public boolean equals(Object other) {
  if (!(other instanceof UnsafeRow)) {
    return false;
  }
  UnsafeRow that = (UnsafeRow) other;
  if (sizeInBytes != that.sizeInBytes) {
    return false;
  }
  return ByteArrayMethods.arrayEquals(
      baseObject, baseOffset, that.baseObject, that.baseOffset, sizeInBytes);
}
/**
 * Compares this string with another object. The result is true only when {@code other} is a
 * {@code UTF8String} with the same {@code numBytes} whose bytes are reported equal by
 * {@code ByteArrayMethods.arrayEquals}.
 */
@Override
public boolean equals(final Object other) {
  if (!(other instanceof UTF8String)) {
    return false;
  }
  UTF8String that = (UTF8String) other;
  // A length mismatch settles the answer without touching the bytes.
  return numBytes == that.numBytes
      && ByteArrayMethods.arrayEquals(base, offset, that.base, that.offset, numBytes);
}
/**
 * Treats this string as a comma-separated list and looks up {@code match} among its items.
 *
 * @param match the item to look for
 * @return the 1-based position of the first item equal to {@code match}; 0 when
 *         {@code match} contains a comma or when no item matches
 */
public int findInSet(UTF8String match) {
  if (match.contains(COMMA_UTF8)) {
    return 0;
  }

  int itemIndex = 1;  // 1-based position of the item currently being scanned
  int itemStart = 0;  // byte offset at which that item begins

  // The end of the string is handled as one final delimiter so the last item is compared
  // by the same code path as all the others.
  for (int end = 0; end <= numBytes; end++) {
    boolean atDelimiter = end == numBytes || getByte(end) == (byte) ',';
    if (!atDelimiter) {
      continue;
    }
    boolean sameLength = end - itemStart == match.numBytes;
    if (sameLength && ByteArrayMethods.arrayEquals(
        base, offset + itemStart, match.base, match.offset, match.numBytes)) {
      return itemIndex;
    }
    itemStart = end + 1;
    itemIndex++;
  }
  return 0;
}
/** * Levenshtein distance is a metric for measuring the distance of two strings. The distance is * defined by the minimum number of single-character edits (i.e. insertions, deletions or * substitutions) that are required to change one of the strings into the other. */ public int levenshteinDistance(UTF8String other) { // Implementation adopted from org.apache.common.lang3.StringUtils.getLevenshteinDistance int n = numChars(); int m = other.numChars(); if (n == 0) { return m; } else if (m == 0) { return n; } UTF8String s, t; if (n <= m) { s = this; t = other; } else { s = other; t = this; int swap; swap = n; n = m; m = swap; } int[] p = new int[n + 1]; int[] d = new int[n + 1]; int[] swap; int i, i_bytes, j, j_bytes, num_bytes_j, cost; for (i = 0; i <= n; i++) { p[i] = i; } for (j = 0, j_bytes = 0; j < m; j_bytes += num_bytes_j, j++) { num_bytes_j = numBytesForFirstByte(t.getByte(j_bytes)); d[0] = j + 1; for (i = 0, i_bytes = 0; i < n; i_bytes += numBytesForFirstByte(s.getByte(i_bytes)), i++) { if (s.getByte(i_bytes) != t.getByte(j_bytes) || num_bytes_j != numBytesForFirstByte(s.getByte(i_bytes))) { cost = 1; } else { cost = (ByteArrayMethods.arrayEquals( t.base, t.offset + j_bytes, s.base, s.offset + i_bytes, num_bytes_j)) ? 0 : 1; } d[i + 1] = Math.min(Math.min(d[i] + 1, p[i + 1] + 1), p[i] + cost); } swap = p; p = d; d = swap; } return p[n]; }
/**
 * Tests whether the bytes of {@code s} occur in this string at byte offset {@code pos}.
 *
 * @param s the string whose bytes are expected at {@code pos}
 * @param pos the byte offset in this string to compare at
 * @return false when {@code pos} is negative or {@code s} would extend past {@code numBytes};
 *         otherwise the result of the byte comparison
 */
private boolean matchAt(final UTF8String s, int pos) {
  return pos >= 0
      && s.numBytes + pos <= numBytes
      && ByteArrayMethods.arrayEquals(base, offset + pos, s.base, s.offset, s.numBytes);
}