/**
 * Decodes the character at {@code cdt.offset} with the currently selected G0/G1 sets and
 * advances {@code cdt.offset} past whatever was consumed.
 *
 * <p>When {@code translateNCR} is set and the decoded character is {@code '&'}, the next eight
 * characters are tested against the form {@code &#xXXXX;} (exactly four hex digits). On a match
 * the value produced by {@code getCharFromCodePoint} for those digits is returned and the
 * offset moves forward by eight. In every other case the offset moves forward by one.
 *
 * @param data the characters being converted
 * @param cdt tracker holding the current offset and active character sets; its offset is updated
 * @return the decoded character
 */
private char getCharCDT(char[] data, CodeTracker cdt) {
    char c = getChar(data[cdt.offset], cdt.g0, cdt.g1);

    // ">=" rather than ">": a reference whose ';' is the very last character of data is still
    // a complete 8-character reference and must be translated here too.
    if (translateNCR && c == '&' && data.length >= cdt.offset + 8) {
        String candidate = new String(data, cdt.offset, 8);
        if (candidate.matches("&#x[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f];")) {
            c = getCharFromCodePoint(candidate.substring(3, 7));
            cdt.offset += 8;
            return c;
        }
    }

    cdt.offset++;
    return c;
}
private void set_cdt( CodeTracker cdt, int g0_or_g1, char[] data, int addnlOffset, boolean multibyte) { if (data[cdt.offset + addnlOffset] == '!' && data[cdt.offset + addnlOffset + 1] == 'E') { addnlOffset++; } else if (data[cdt.offset + addnlOffset] == ' ') { if (errorList != null) { errorList.addError( ErrorHandler.ERROR_TYPO, "Extraneous space character found within MARC8 character set escape sequence. Skipping over space."); } else { throw new MarcException( "Extraneous space character found within MARC8 character set escape sequence"); } addnlOffset++; } else if ("(,)-$!".indexOf(data[cdt.offset + addnlOffset]) != -1) { if (errorList != null) { errorList.addError( ErrorHandler.MINOR_ERROR, "Extraneaous intermediate character found following escape character. Discarding intermediate character."); } else { throw new MarcException( "Extraneaous intermediate character found following escape character."); } addnlOffset++; } if ("34BE1NQS2".indexOf(data[cdt.offset + addnlOffset]) == -1) { cdt.offset += 1; cdt.multibyte = false; if (errorList != null) { errorList.addError( ErrorHandler.MINOR_ERROR, "Unknown character set code found following escape character. Discarding escape character."); } else { throw new MarcException("Unknown character set code found following escape character."); } } else // All is well, proceed normally { if (g0_or_g1 == 0) cdt.g0 = data[cdt.offset + addnlOffset]; else cdt.g1 = data[cdt.offset + addnlOffset]; cdt.offset += 1 + addnlOffset; cdt.multibyte = multibyte; } }
private String convertMultibyte(CodeTracker cdt, char[] data) { StringBuffer sb = new StringBuffer(); int offset = cdt.offset; while (offset < data.length && data[offset] != 0x1b) { int length = getRawMBLength(data, offset); int spaces = getNumSpacesInMBLength(data, offset); boolean errorsPresent = false; if ((length - spaces) % 3 != 0) errorsPresent = true; // if a 0x20 byte occurs amidst a sequence of multibyte characters // skip over it and output a space. if (data[offset] == 0x20) { sb.append(' '); offset++; } else if (errorsPresent == false && offset + 3 <= data.length && (errorList == null || data[offset + 1] != 0x20 && data[offset + 2] != 0x20) && getMBChar(makeMultibyte(data[offset], data[offset + 1], data[offset + 2])) != 0) { char c = getMBChar(makeMultibyte(data[offset], data[offset + 1], data[offset + 2])); if (errorList == null || c != 0) { sb.append(c); offset += 3; } } else if (offset + 6 < data.length && data[offset + 4] != 0x20 && (getMBChar(makeMultibyte(data[offset + 0], data[offset + 1], data[offset + 2])) == 0 || getMBChar(makeMultibyte(data[offset + 3], data[offset + 4], data[offset + 5])) == 0) && getMBChar(makeMultibyte(data[offset + 2], data[offset + 3], data[offset + 4])) != 0) { String mbstr = getMBCharStr(makeMultibyte(data[offset], '[', data[offset + 1])) + getMBCharStr(makeMultibyte(data[offset], ']', data[offset + 1])) + getMBCharStr(makeMultibyte(data[offset], data[offset + 1], '[')) + getMBCharStr(makeMultibyte(data[offset], data[offset + 1], ']')); if (mbstr.length() == 1) { if (errorList != null) errorList.addError( ErrorHandler.MINOR_ERROR, "Missing square brace character in MARC8 multibyte character, inserting one to create the only valid option"); sb.append(mbstr); offset += 2; } else if (mbstr.length() > 1) { if (errorList != null) errorList.addError( ErrorHandler.MAJOR_ERROR, "Missing square brace character in MARC8 multibyte character, inserting one to create a randomly chosen valid option"); sb.append(mbstr.subSequence(0, 
1)); offset += 2; } else if (mbstr.length() == 0) { if (errorList != null) errorList.addError( ErrorHandler.MINOR_ERROR, "Erroneous MARC8 multibyte character, Discarding bad character and continuing reading Multibyte characters"); sb.append("[?]"); offset += 3; } } else if (offset + 7 < data.length && data[offset + 4] != 0x20 && (getMBChar(makeMultibyte(data[offset + 0], data[offset + 1], data[offset + 2])) == 0 || getMBChar(makeMultibyte(data[offset + 3], data[offset + 4], data[offset + 5])) == 0) && getMBChar(makeMultibyte(data[offset + 4], data[offset + 5], data[offset + 6])) != 0) { String mbstr = getMBCharStr(makeMultibyte(data[offset], '[', data[offset + 1])) + getMBCharStr(makeMultibyte(data[offset], ']', data[offset + 1])) + getMBCharStr(makeMultibyte(data[offset], data[offset + 1], '[')) + getMBCharStr(makeMultibyte(data[offset], data[offset + 1], ']')); if (mbstr.length() == 1) { if (errorList != null) errorList.addError( ErrorHandler.MINOR_ERROR, "Missing square brace character in MARC8 multibyte character, inserting one to create the only valid option"); sb.append(mbstr); offset += 2; } else if (mbstr.length() > 1) { if (errorList != null) errorList.addError( ErrorHandler.MAJOR_ERROR, "Missing square brace character in MARC8 multibyte character, inserting one to create a randomly chosen valid option"); sb.append(mbstr.subSequence(0, 1)); offset += 2; } else if (mbstr.length() == 0) { if (errorList != null) errorList.addError( ErrorHandler.MINOR_ERROR, "Erroneous MARC8 multibyte character, Discarding bad character and continuing reading Multibyte characters"); sb.append("[?]"); offset += 3; } } else if (offset + 4 <= data.length && data[offset] > 0x7f && getMBChar(makeMultibyte(data[offset + 1], data[offset + 2], data[offset + 3])) != 0) { if (errorList != null) { errorList.addError( ErrorHandler.MINOR_ERROR, "Erroneous character in MARC8 multibyte character, Copying bad character and continuing reading Multibyte characters"); 
sb.append(getChar(data[offset], 0x42, 0x45)); offset += 1; } } else if (errorList != null && offset + 4 <= data.length && (data[offset + 1] == 0x20 || data[offset + 2] == 0x20)) { int multiByte = makeMultibyte( data[offset], ((data[offset + 1] != 0x20) ? data[offset + 1] : data[offset + 2]), data[offset + 3]); char c = getMBChar(multiByte); if (c != 0) { if (errorList != null) { errorList.addError( ErrorHandler.ERROR_TYPO, "Extraneous space found within MARC8 multibyte character"); } sb.append(c); sb.append(' '); offset += 4; } else { if (errorList != null) { errorList.addError( ErrorHandler.MINOR_ERROR, "Erroneous MARC8 multibyte character, inserting change to default character set"); } cdt.multibyte = false; cdt.g0 = 0x42; cdt.g1 = 0x45; break; } } else if (offset + 3 > data.length || offset + 3 == data.length && (data[offset + 1] == 0x20 || data[offset + 2] == 0x20)) { if (errorList != null) { errorList.addError( ErrorHandler.MINOR_ERROR, "Partial MARC8 multibyte character, inserting change to default character set"); } cdt.multibyte = false; cdt.g0 = 0x42; cdt.g1 = 0x45; break; } else if (offset + 3 <= data.length && getMBChar(makeMultibyte(data[offset + 0], data[offset + 1], data[offset + 2])) != 0) { char c = getMBChar(makeMultibyte(data[offset], data[offset + 1], data[offset + 2])); if (errorList == null || c != 0) { sb.append(c); offset += 3; } } else { if (errorList != null) { errorList.addError( ErrorHandler.MINOR_ERROR, "Erroneous MARC8 multibyte character, inserting change to default character set"); } cdt.multibyte = false; cdt.g0 = 0x42; cdt.g1 = 0x45; break; } } cdt.offset = offset; return (sb.toString()); }
/** * Converts MARC-8 data to UCS/Unicode. * * @param data - the MARC-8 data in an array of char * @return String - the UCS/Unicode data */ public String convert(char data[]) { StringBuffer sb = new StringBuffer(); int len = data.length; CodeTracker cdt = new CodeTracker(); cdt.g0 = 0x42; cdt.g1 = 0x45; cdt.multibyte = false; cdt.offset = 0; checkMode(data, cdt); Queue diacritics = new Queue(); while (cdt.offset < data.length) { if (ct.isCombining(data[cdt.offset], cdt.g0, cdt.g1) && hasNext(cdt.offset, len)) { while (cdt.offset < len && ct.isCombining(data[cdt.offset], cdt.g0, cdt.g1) && hasNext(cdt.offset, len)) { char c = getCharCDT(data, cdt); if (c != 0) diacritics.put(new Character(c)); checkMode(data, cdt); } if (cdt.offset >= len) { if (errorList != null) { errorList.addError( ErrorHandler.MINOR_ERROR, "Diacritic found at the end of field, without the character that it is supposed to decorate"); break; } } char c2 = getCharCDT(data, cdt); checkMode(data, cdt); if (c2 != 0) sb.append(c2); while (!diacritics.isEmpty()) { char c1 = ((Character) diacritics.get()).charValue(); sb.append(c1); } } else if (cdt.multibyte) { String mbstr = convertMultibyte(cdt, data); sb.append(mbstr); } else { int offset = cdt.offset; char cdtchar = data[offset]; char c = getCharCDT(data, cdt); boolean greekErrorFixed = false; if (errorList != null && cdt.g0 == 0x53 && data[offset] > 0x20 && data[offset] < 0x40) { if (c == 0 && data[offset] > 0x20 && data[offset] < 0x40) { errorList.addError( ErrorHandler.MINOR_ERROR, "Unknown punctuation mark found in Greek character set, inserting change to default character set"); cdt.g0 = 0x42; // change to default character set c = getChar(data[offset], cdt.g0, cdt.g1); if (c != 0) { sb.append(c); greekErrorFixed = true; } } else if (offset + 1 < data.length && data[offset] >= '0' && data[offset] <= '9' && data[offset + 1] >= '0' && data[offset + 1] <= '9') { errorList.addError( ErrorHandler.MINOR_ERROR, "Unlikely sequence of punctuation mark 
found in Greek character set, it likely a number, inserting change to default character set"); cdt.g0 = 0x42; // change to default character set char c1 = getChar(data[offset], cdt.g0, cdt.g1); if (c1 != 0) { sb.append(c1); greekErrorFixed = true; } } } if (!greekErrorFixed && c != 0) sb.append(c); else if (!greekErrorFixed && c == 0) { String val = "0000" + Integer.toHexString((int) (cdtchar)); sb.append("<U+" + (val.substring(val.length() - 4, val.length())) + ">"); } } if (hasNext(cdt.offset, len)) { checkMode(data, cdt); } } String dataElement = sb.toString(); if (translateNCR && dataElement.matches("[^&]*&#x[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f];.*")) { Pattern pattern = Pattern.compile("&#x([0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f]);"); Matcher matcher = pattern.matcher(dataElement); StringBuffer newElement = new StringBuffer(); int prevEnd = 0; while (matcher.find()) { newElement.append(dataElement.substring(prevEnd, matcher.start())); newElement.append(getCharFromCodePoint(matcher.group(1))); prevEnd = matcher.end(); } newElement.append(dataElement.substring(prevEnd)); dataElement = newElement.toString(); } return (dataElement); }
private void checkMode(char[] data, CodeTracker cdt) { int extra = 0; int extra2 = 0; while (cdt.offset + extra + extra2 < data.length && isEscape(data[cdt.offset])) { if (cdt.offset + extra + extra2 + 1 == data.length) { cdt.offset += 1; if (errorList != null) { errorList.addError( ErrorHandler.MINOR_ERROR, "Escape character found at end of field, discarding it."); } else { throw new MarcException("Escape character found at end of field"); } break; } switch (data[cdt.offset + 1 + extra]) { case 0x28: // '(' case 0x2c: // ',' set_cdt(cdt, 0, data, 2 + extra, false); break; case 0x29: // ')' case 0x2d: // '-' set_cdt(cdt, 1, data, 2 + extra, false); break; case 0x24: // '$' if (!loadedMultibyte) { loadMultibyte(); loadedMultibyte = true; } switch (data[cdt.offset + 2 + extra + extra2]) { case 0x29: // ')' case 0x2d: // '-' set_cdt(cdt, 1, data, 3 + extra + extra2, true); break; case 0x2c: // ',' set_cdt(cdt, 0, data, 3 + extra + extra2, true); break; case 0x31: // '1' cdt.g0 = data[cdt.offset + 2 + extra + extra2]; cdt.offset += 3 + extra + extra2; cdt.multibyte = true; break; case 0x20: // ' ' // space found in escape code: look ahead and try to proceed extra2++; break; default: // unknown code character found: discard escape sequence and return cdt.offset += 1; if (errorList != null) { errorList.addError( ErrorHandler.MINOR_ERROR, "Unknown character set code found following escape character. 
Discarding escape character."); } else { throw new MarcException( "Unknown character set code found following escape character."); } break; } break; case 0x67: // 'g' case 0x62: // 'b' case 0x70: // 'p' cdt.g0 = data[cdt.offset + 1 + extra]; cdt.offset += 2 + extra; cdt.multibyte = false; break; case 0x73: // 's' cdt.g0 = 0x42; cdt.offset += 2 + extra; cdt.multibyte = false; break; case 0x20: // ' ' // space found in escape code: look ahead and try to proceed if (errorList == null) { throw new MarcException( "Extraneous space character found within MARC8 character set escape sequence"); } extra++; break; default: // unknown code character found: discard escape sequence and return cdt.offset += 1; if (errorList != null) { errorList.addError( ErrorHandler.MINOR_ERROR, "Unknown character set code found following escape character. Discarding escape character."); } else { throw new MarcException("Unknown character set code found following escape character."); } break; } } if (errorList != null && (extra != 0 || extra2 != 0)) { errorList.addError( ErrorHandler.ERROR_TYPO, "" + (extra + extra2) + " extraneous space characters found within MARC8 character set escape sequence"); } }