private void set_cdt( CodeTracker cdt, int g0_or_g1, char[] data, int addnlOffset, boolean multibyte) { if (data[cdt.offset + addnlOffset] == '!' && data[cdt.offset + addnlOffset + 1] == 'E') { addnlOffset++; } else if (data[cdt.offset + addnlOffset] == ' ') { if (errorList != null) { errorList.addError( ErrorHandler.ERROR_TYPO, "Extraneous space character found within MARC8 character set escape sequence. Skipping over space."); } else { throw new MarcException( "Extraneous space character found within MARC8 character set escape sequence"); } addnlOffset++; } else if ("(,)-$!".indexOf(data[cdt.offset + addnlOffset]) != -1) { if (errorList != null) { errorList.addError( ErrorHandler.MINOR_ERROR, "Extraneaous intermediate character found following escape character. Discarding intermediate character."); } else { throw new MarcException( "Extraneaous intermediate character found following escape character."); } addnlOffset++; } if ("34BE1NQS2".indexOf(data[cdt.offset + addnlOffset]) == -1) { cdt.offset += 1; cdt.multibyte = false; if (errorList != null) { errorList.addError( ErrorHandler.MINOR_ERROR, "Unknown character set code found following escape character. Discarding escape character."); } else { throw new MarcException("Unknown character set code found following escape character."); } } else // All is well, proceed normally { if (g0_or_g1 == 0) cdt.g0 = data[cdt.offset + addnlOffset]; else cdt.g1 = data[cdt.offset + addnlOffset]; cdt.offset += 1 + addnlOffset; cdt.multibyte = multibyte; } }
private String convertMultibyte(CodeTracker cdt, char[] data) { StringBuffer sb = new StringBuffer(); int offset = cdt.offset; while (offset < data.length && data[offset] != 0x1b) { int length = getRawMBLength(data, offset); int spaces = getNumSpacesInMBLength(data, offset); boolean errorsPresent = false; if ((length - spaces) % 3 != 0) errorsPresent = true; // if a 0x20 byte occurs amidst a sequence of multibyte characters // skip over it and output a space. if (data[offset] == 0x20) { sb.append(' '); offset++; } else if (errorsPresent == false && offset + 3 <= data.length && (errorList == null || data[offset + 1] != 0x20 && data[offset + 2] != 0x20) && getMBChar(makeMultibyte(data[offset], data[offset + 1], data[offset + 2])) != 0) { char c = getMBChar(makeMultibyte(data[offset], data[offset + 1], data[offset + 2])); if (errorList == null || c != 0) { sb.append(c); offset += 3; } } else if (offset + 6 < data.length && data[offset + 4] != 0x20 && (getMBChar(makeMultibyte(data[offset + 0], data[offset + 1], data[offset + 2])) == 0 || getMBChar(makeMultibyte(data[offset + 3], data[offset + 4], data[offset + 5])) == 0) && getMBChar(makeMultibyte(data[offset + 2], data[offset + 3], data[offset + 4])) != 0) { String mbstr = getMBCharStr(makeMultibyte(data[offset], '[', data[offset + 1])) + getMBCharStr(makeMultibyte(data[offset], ']', data[offset + 1])) + getMBCharStr(makeMultibyte(data[offset], data[offset + 1], '[')) + getMBCharStr(makeMultibyte(data[offset], data[offset + 1], ']')); if (mbstr.length() == 1) { if (errorList != null) errorList.addError( ErrorHandler.MINOR_ERROR, "Missing square brace character in MARC8 multibyte character, inserting one to create the only valid option"); sb.append(mbstr); offset += 2; } else if (mbstr.length() > 1) { if (errorList != null) errorList.addError( ErrorHandler.MAJOR_ERROR, "Missing square brace character in MARC8 multibyte character, inserting one to create a randomly chosen valid option"); sb.append(mbstr.subSequence(0, 1)); offset += 2; } else if (mbstr.length() == 0) { if (errorList != null) errorList.addError( ErrorHandler.MINOR_ERROR, "Erroneous MARC8 multibyte character, Discarding bad character and continuing reading Multibyte characters"); sb.append("[?]"); offset += 3; } } else if (offset + 7 < data.length && data[offset + 4] != 0x20 && (getMBChar(makeMultibyte(data[offset + 0], data[offset + 1], data[offset + 2])) == 0 || getMBChar(makeMultibyte(data[offset + 3], data[offset + 4], data[offset + 5])) == 0) && getMBChar(makeMultibyte(data[offset + 4], data[offset + 5], data[offset + 6])) != 0) { String mbstr = getMBCharStr(makeMultibyte(data[offset], '[', data[offset + 1])) + getMBCharStr(makeMultibyte(data[offset], ']', data[offset + 1])) + getMBCharStr(makeMultibyte(data[offset], data[offset + 1], '[')) + getMBCharStr(makeMultibyte(data[offset], data[offset + 1], ']')); if (mbstr.length() == 1) { if (errorList != null) errorList.addError( ErrorHandler.MINOR_ERROR, "Missing square brace character in MARC8 multibyte character, inserting one to create the only valid option"); sb.append(mbstr); offset += 2; } else if (mbstr.length() > 1) { if (errorList != null) errorList.addError( ErrorHandler.MAJOR_ERROR, "Missing square brace character in MARC8 multibyte character, inserting one to create a randomly chosen valid option"); sb.append(mbstr.subSequence(0, 1)); offset += 2; } else if (mbstr.length() == 0) { if (errorList != null) errorList.addError( ErrorHandler.MINOR_ERROR, "Erroneous MARC8 multibyte character, Discarding bad character and continuing reading Multibyte characters"); sb.append("[?]"); offset += 3; } } else if (offset + 4 <= data.length && data[offset] > 0x7f && getMBChar(makeMultibyte(data[offset + 1], data[offset + 2], data[offset + 3])) != 0) { if (errorList != null) { errorList.addError( ErrorHandler.MINOR_ERROR, "Erroneous character in MARC8 multibyte character, Copying bad character and continuing reading Multibyte characters"); sb.append(getChar(data[offset], 0x42, 0x45)); offset += 1; } } else if (errorList != null && offset + 4 <= data.length && (data[offset + 1] == 0x20 || data[offset + 2] == 0x20)) { int multiByte = makeMultibyte( data[offset], ((data[offset + 1] != 0x20) ? data[offset + 1] : data[offset + 2]), data[offset + 3]); char c = getMBChar(multiByte); if (c != 0) { if (errorList != null) { errorList.addError( ErrorHandler.ERROR_TYPO, "Extraneous space found within MARC8 multibyte character"); } sb.append(c); sb.append(' '); offset += 4; } else { if (errorList != null) { errorList.addError( ErrorHandler.MINOR_ERROR, "Erroneous MARC8 multibyte character, inserting change to default character set"); } cdt.multibyte = false; cdt.g0 = 0x42; cdt.g1 = 0x45; break; } } else if (offset + 3 > data.length || offset + 3 == data.length && (data[offset + 1] == 0x20 || data[offset + 2] == 0x20)) { if (errorList != null) { errorList.addError( ErrorHandler.MINOR_ERROR, "Partial MARC8 multibyte character, inserting change to default character set"); } cdt.multibyte = false; cdt.g0 = 0x42; cdt.g1 = 0x45; break; } else if (offset + 3 <= data.length && getMBChar(makeMultibyte(data[offset + 0], data[offset + 1], data[offset + 2])) != 0) { char c = getMBChar(makeMultibyte(data[offset], data[offset + 1], data[offset + 2])); if (errorList == null || c != 0) { sb.append(c); offset += 3; } } else { if (errorList != null) { errorList.addError( ErrorHandler.MINOR_ERROR, "Erroneous MARC8 multibyte character, inserting change to default character set"); } cdt.multibyte = false; cdt.g0 = 0x42; cdt.g1 = 0x45; break; } } cdt.offset = offset; return (sb.toString()); }
/** * Converts MARC-8 data to UCS/Unicode. * * @param data - the MARC-8 data in an array of char * @return String - the UCS/Unicode data */ public String convert(char data[]) { StringBuffer sb = new StringBuffer(); int len = data.length; CodeTracker cdt = new CodeTracker(); cdt.g0 = 0x42; cdt.g1 = 0x45; cdt.multibyte = false; cdt.offset = 0; checkMode(data, cdt); Queue diacritics = new Queue(); while (cdt.offset < data.length) { if (ct.isCombining(data[cdt.offset], cdt.g0, cdt.g1) && hasNext(cdt.offset, len)) { while (cdt.offset < len && ct.isCombining(data[cdt.offset], cdt.g0, cdt.g1) && hasNext(cdt.offset, len)) { char c = getCharCDT(data, cdt); if (c != 0) diacritics.put(new Character(c)); checkMode(data, cdt); } if (cdt.offset >= len) { if (errorList != null) { errorList.addError( ErrorHandler.MINOR_ERROR, "Diacritic found at the end of field, without the character that it is supposed to decorate"); break; } } char c2 = getCharCDT(data, cdt); checkMode(data, cdt); if (c2 != 0) sb.append(c2); while (!diacritics.isEmpty()) { char c1 = ((Character) diacritics.get()).charValue(); sb.append(c1); } } else if (cdt.multibyte) { String mbstr = convertMultibyte(cdt, data); sb.append(mbstr); } else { int offset = cdt.offset; char cdtchar = data[offset]; char c = getCharCDT(data, cdt); boolean greekErrorFixed = false; if (errorList != null && cdt.g0 == 0x53 && data[offset] > 0x20 && data[offset] < 0x40) { if (c == 0 && data[offset] > 0x20 && data[offset] < 0x40) { errorList.addError( ErrorHandler.MINOR_ERROR, "Unknown punctuation mark found in Greek character set, inserting change to default character set"); cdt.g0 = 0x42; // change to default character set c = getChar(data[offset], cdt.g0, cdt.g1); if (c != 0) { sb.append(c); greekErrorFixed = true; } } else if (offset + 1 < data.length && data[offset] >= '0' && data[offset] <= '9' && data[offset + 1] >= '0' && data[offset + 1] <= '9') { errorList.addError( ErrorHandler.MINOR_ERROR, "Unlikely sequence of punctuation mark found in Greek character set, it likely a number, inserting change to default character set"); cdt.g0 = 0x42; // change to default character set char c1 = getChar(data[offset], cdt.g0, cdt.g1); if (c1 != 0) { sb.append(c1); greekErrorFixed = true; } } } if (!greekErrorFixed && c != 0) sb.append(c); else if (!greekErrorFixed && c == 0) { String val = "0000" + Integer.toHexString((int) (cdtchar)); sb.append("<U+" + (val.substring(val.length() - 4, val.length())) + ">"); } } if (hasNext(cdt.offset, len)) { checkMode(data, cdt); } } String dataElement = sb.toString(); if (translateNCR && dataElement.matches("[^&]*&#x[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f];.*")) { Pattern pattern = Pattern.compile("&#x([0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f]);"); Matcher matcher = pattern.matcher(dataElement); StringBuffer newElement = new StringBuffer(); int prevEnd = 0; while (matcher.find()) { newElement.append(dataElement.substring(prevEnd, matcher.start())); newElement.append(getCharFromCodePoint(matcher.group(1))); prevEnd = matcher.end(); } newElement.append(dataElement.substring(prevEnd)); dataElement = newElement.toString(); } return (dataElement); }