예제 #1
0
 /**
  * Decodes the character at {@code cdt.offset} with the currently active G0/G1 sets and advances
  * {@code cdt.offset} past the input that was consumed.
  *
  * <p>When {@code translateNCR} is enabled, the decoded character is {@code '&'}, and the input at
  * the current offset is a numeric character reference of the exact form {@code &#xHHHH;} (four
  * ASCII hex digits), the whole eight-char reference is consumed and the result of
  * {@code getCharFromCodePoint} on the four hex digits is returned. In every other case exactly
  * one input char is consumed.
  *
  * @param data the MARC-8 input being converted
  * @param cdt decoding state; {@code g0}/{@code g1} are read, {@code offset} is read and advanced
  * @return the decoded character
  */
 private char getCharCDT(char[] data, CodeTracker cdt) {
   final int start = cdt.offset;
   char c = getChar(data[start], cdt.g0, cdt.g1);
   // A reference is 8 chars long, so it fits whenever start + 8 <= data.length. The earlier
   // strict '>' comparison skipped a reference that ended exactly at the end of the data.
   if (translateNCR
       && c == '&'
       && data.length - start >= 8
       && data[start] == '&'
       && data[start + 1] == '#'
       && data[start + 2] == 'x'
       && data[start + 7] == ';') {
     boolean hexOnly = true;
     for (int i = start + 3; hexOnly && i < start + 7; i++) {
       char digit = data[i];
       // ASCII hex digits only, matching the original [0-9A-Fa-f] character class.
       hexOnly =
           (digit >= '0' && digit <= '9')
               || (digit >= 'A' && digit <= 'F')
               || (digit >= 'a' && digit <= 'f');
     }
     if (hexOnly) {
       c = getCharFromCodePoint(new String(data, start + 3, 4));
       cdt.offset = start + 8;
       return c;
     }
   }
   cdt.offset = start + 1;
   return c;
 }
예제 #2
0
 /**
  * Completes the handling of a character-set escape sequence by reading the final character
  * located {@code addnlOffset} chars after {@code cdt.offset} and installing it as the G0 or G1
  * set.
  *
  * <p>Before the final character is read, one leading char is skipped when it is: a {@code '!'}
  * that is followed by {@code 'E'}; a space (reported as a typo); or one of the intermediate
  * characters {@code ( , ) - $ !} (reported as a minor error). In the latter two cases a
  * {@link MarcException} is thrown instead when no {@code errorList} is configured.
  *
  * <p>If the final character is not one of {@code 34BE1NQS2}, or the data ends before it can be
  * read, only the escape character is discarded ({@code cdt.offset} advances by one),
  * {@code cdt.multibyte} is cleared, and the problem is reported or thrown as above.
  *
  * @param cdt decoding state to update
  * @param g0_or_g1 {@code 0} to set {@code cdt.g0}; any other value sets {@code cdt.g1}
  * @param data the MARC-8 input being converted
  * @param addnlOffset distance from {@code cdt.offset} to the expected final character
  * @param multibyte value stored in {@code cdt.multibyte} when the sequence is accepted
  * @throws MarcException on a malformed sequence when no {@code errorList} is configured
  */
 private void set_cdt(
     CodeTracker cdt, int g0_or_g1, char[] data, int addnlOffset, boolean multibyte) {
   int finalPos = cdt.offset + addnlOffset;
   if (finalPos < data.length) {
     char lead = data[finalPos];
     if (lead == '!' && finalPos + 1 < data.length && data[finalPos + 1] == 'E') {
       finalPos++;
     } else if (lead == ' ') {
       if (errorList != null) {
         errorList.addError(
             ErrorHandler.ERROR_TYPO,
             "Extraneous space character found within MARC8 character set escape sequence. Skipping over space.");
       } else {
         throw new MarcException(
             "Extraneous space character found within MARC8 character set escape sequence");
       }
       finalPos++;
     } else if ("(,)-$!".indexOf(lead) != -1) {
       if (errorList != null) {
         errorList.addError(
             ErrorHandler.MINOR_ERROR,
             "Extraneaous intermediate character found following escape character. Discarding intermediate character.");
       } else {
         throw new MarcException(
             "Extraneaous intermediate character found following escape character.");
       }
       finalPos++;
     }
   }
   if (finalPos >= data.length) {
     // The sequence is cut off by the end of the data. Previously this indexed past the array;
     // now it is handled like any other unusable escape sequence.
     cdt.offset += 1;
     cdt.multibyte = false;
     if (errorList != null) {
       errorList.addError(
           ErrorHandler.MINOR_ERROR,
           "Incomplete character set escape sequence found at end of field. Discarding escape character.");
     } else {
       throw new MarcException("Incomplete character set escape sequence found at end of field.");
     }
   } else if ("34BE1NQS2".indexOf(data[finalPos]) == -1) {
     cdt.offset += 1;
     cdt.multibyte = false;
     if (errorList != null) {
       errorList.addError(
           ErrorHandler.MINOR_ERROR,
           "Unknown character set code found following escape character. Discarding escape character.");
     } else {
       throw new MarcException("Unknown character set code found following escape character.");
     }
   } else {
     // All is well, proceed normally
     if (g0_or_g1 == 0) {
       cdt.g0 = data[finalPos];
     } else {
       cdt.g1 = data[finalPos];
     }
     cdt.offset = finalPos + 1;
     cdt.multibyte = multibyte;
   }
 }
예제 #3
0
 /**
  * Converts a run of MARC-8 multibyte (three chars per character) data starting at
  * {@code cdt.offset}.
  *
  * <p>The run ends at the end of {@code data}, at an escape character ({@code 0x1b}), or when a
  * group cannot be recovered; in that last case the tracker is switched back to the default sets
  * ({@code g0 = 0x42}, {@code g1 = 0x45}, {@code multibyte = false}). Along the way several
  * repairs are attempted for damaged input (stray spaces, a missing square brace, a stray
  * high-bit char); each repair is recorded in {@code errorList} when one is configured. A zero
  * result from {@code getMBChar} is treated as "no character for this triple".
  *
  * @param cdt decoding state; {@code offset} is set to the first unconsumed position on return
  * @param data the MARC-8 input being converted
  * @return the converted text for the consumed portion
  */
 private String convertMultibyte(CodeTracker cdt, char[] data) {
   StringBuilder sb = new StringBuilder();
   int offset = cdt.offset;
   while (offset < data.length && data[offset] != 0x1b) {
     int length = getRawMBLength(data, offset);
     int spaces = getNumSpacesInMBLength(data, offset);
     // The run is suspect when its non-space length is not a whole number of triples.
     boolean errorsPresent = (length - spaces) % 3 != 0;
     // if a 0x20 byte occurs amidst a sequence of multibyte characters
     // skip over it and output a space.
     if (data[offset] == 0x20) {
       sb.append(' ');
       offset++;
     } else if (errorsPresent == false
         && offset + 3 <= data.length
         && (errorList == null || data[offset + 1] != 0x20 && data[offset + 2] != 0x20)
         && getMBChar(makeMultibyte(data[offset], data[offset + 1], data[offset + 2])) != 0) {
       char c = getMBChar(makeMultibyte(data[offset], data[offset + 1], data[offset + 2]));
       // The branch condition already required a non-zero lookup for this same triple.
       if (errorList == null || c != 0) {
         sb.append(c);
         offset += 3;
       }
     } else if (offset + 6 < data.length
         && data[offset + 4] != 0x20
         && (getMBChar(makeMultibyte(data[offset + 0], data[offset + 1], data[offset + 2])) == 0
             || getMBChar(makeMultibyte(data[offset + 3], data[offset + 4], data[offset + 5]))
                 == 0)
         && getMBChar(makeMultibyte(data[offset + 2], data[offset + 3], data[offset + 4])) != 0) {
       // Decoding lines up again two chars further on: assume one char of this triple is missing.
       offset += repairMissingBrace(data, offset, sb);
     } else if (offset + 7 < data.length
         && data[offset + 4] != 0x20
         && (getMBChar(makeMultibyte(data[offset + 0], data[offset + 1], data[offset + 2])) == 0
             || getMBChar(makeMultibyte(data[offset + 3], data[offset + 4], data[offset + 5]))
                 == 0)
         && getMBChar(makeMultibyte(data[offset + 4], data[offset + 5], data[offset + 6])) != 0) {
       offset += repairMissingBrace(data, offset, sb);
     } else if (offset + 4 <= data.length
         && data[offset] > 0x7f
         && getMBChar(makeMultibyte(data[offset + 1], data[offset + 2], data[offset + 3])) != 0) {
       if (errorList != null) {
         errorList.addError(
             ErrorHandler.MINOR_ERROR,
             "Erroneous character in MARC8 multibyte character, Copying bad character and continuing reading Multibyte characters");
       }
       // Copy the stray char and advance whether or not errors are being collected. Doing this
       // only when errorList was set left the loop spinning forever on this offset otherwise.
       sb.append(getChar(data[offset], 0x42, 0x45));
       offset += 1;
     } else if (errorList != null
         && offset + 4 <= data.length
         && (data[offset + 1] == 0x20 || data[offset + 2] == 0x20)) {
       // A space sits inside the triple: rebuild it from the three non-space positions.
       int multiByte =
           makeMultibyte(
               data[offset],
               ((data[offset + 1] != 0x20) ? data[offset + 1] : data[offset + 2]),
               data[offset + 3]);
       char c = getMBChar(multiByte);
       if (c != 0) {
         if (errorList != null) {
           errorList.addError(
               ErrorHandler.ERROR_TYPO, "Extraneous space found within MARC8 multibyte character");
         }
         sb.append(c);
         sb.append(' ');
         offset += 4;
       } else {
         if (errorList != null) {
           errorList.addError(
               ErrorHandler.MINOR_ERROR,
               "Erroneous MARC8 multibyte character, inserting change to default character set");
         }
         cdt.multibyte = false;
         cdt.g0 = 0x42;
         cdt.g1 = 0x45;
         break;
       }
     } else if (offset + 3 > data.length
         || offset + 3 == data.length && (data[offset + 1] == 0x20 || data[offset + 2] == 0x20)) {
       if (errorList != null) {
         errorList.addError(
             ErrorHandler.MINOR_ERROR,
             "Partial MARC8 multibyte character, inserting change to default character set");
       }
       cdt.multibyte = false;
       cdt.g0 = 0x42;
       cdt.g1 = 0x45;
       break;
     } else if (offset + 3 <= data.length
         && getMBChar(makeMultibyte(data[offset + 0], data[offset + 1], data[offset + 2])) != 0) {
       char c = getMBChar(makeMultibyte(data[offset], data[offset + 1], data[offset + 2]));
       if (errorList == null || c != 0) {
         sb.append(c);
         offset += 3;
       }
     } else {
       if (errorList != null) {
         errorList.addError(
             ErrorHandler.MINOR_ERROR,
             "Erroneous MARC8 multibyte character, inserting change to default character set");
       }
       cdt.multibyte = false;
       cdt.g0 = 0x42;
       cdt.g1 = 0x45;
       break;
     }
   }
   cdt.offset = offset;
   return (sb.toString());
 }

 /**
  * Tries to repair a multibyte character that appears to have lost a square brace by inserting
  * {@code '['} or {@code ']'} as the second or third char of the triple, and appends the outcome
  * to {@code sb}: the single candidate, the first of several candidates, or {@code "[?]"} when no
  * insertion yields a character.
  *
  * @param data the MARC-8 input being converted
  * @param offset position of the damaged character; {@code offset + 1} must be readable
  * @param sb receives the repaired text
  * @return how many input chars the caller should skip (2 after a repair, 3 when discarding)
  */
 private int repairMissingBrace(char[] data, int offset, StringBuilder sb) {
   String mbstr =
       getMBCharStr(makeMultibyte(data[offset], '[', data[offset + 1]))
           + getMBCharStr(makeMultibyte(data[offset], ']', data[offset + 1]))
           + getMBCharStr(makeMultibyte(data[offset], data[offset + 1], '['))
           + getMBCharStr(makeMultibyte(data[offset], data[offset + 1], ']'));
   if (mbstr.length() == 1) {
     if (errorList != null) {
       errorList.addError(
           ErrorHandler.MINOR_ERROR,
           "Missing square brace character in MARC8 multibyte character, inserting one to create the only valid option");
     }
     sb.append(mbstr);
     return 2;
   }
   if (mbstr.length() > 1) {
     if (errorList != null) {
       errorList.addError(
           ErrorHandler.MAJOR_ERROR,
           "Missing square brace character in MARC8 multibyte character, inserting one to create a randomly chosen valid option");
     }
     sb.append(mbstr.subSequence(0, 1));
     return 2;
   }
   // No candidate decoded at all.
   if (errorList != null) {
     errorList.addError(
         ErrorHandler.MINOR_ERROR,
         "Erroneous MARC8 multibyte character, Discarding bad character and continuing reading Multibyte characters");
   }
   sb.append("[?]");
   return 3;
 }
예제 #4
0
  /**
   * Converts MARC-8 data to UCS/Unicode.
   *
   * <p>Decoding starts with the default sets ({@code g0 = 0x42}, {@code g1 = 0x45}) and honours
   * escape sequences through {@code checkMode}. A run of combining characters is held back and
   * emitted after the character that follows it. A single-byte character that decodes to zero is
   * written as {@code <U+hhhh>} using the hex value of the input char. When {@code translateNCR}
   * is enabled, {@code &#xHHHH;} references remaining in the result are translated in a final
   * pass, provided the first {@code '&'} of the result starts such a reference.
   *
   * <p>If the data ends on combining characters with nothing left to decorate, conversion stops
   * there and those characters are dropped; the condition is recorded when an {@code errorList}
   * is configured.
   *
   * @param data - the MARC-8 data in an array of char
   * @return String - the UCS/Unicode data
   */
  public String convert(char data[]) {
    StringBuilder sb = new StringBuilder();
    final int len = data.length;

    CodeTracker cdt = new CodeTracker();

    cdt.g0 = 0x42;
    cdt.g1 = 0x45;
    cdt.multibyte = false;

    cdt.offset = 0;

    checkMode(data, cdt);

    Queue diacritics = new Queue();

    while (cdt.offset < len) {
      if (ct.isCombining(data[cdt.offset], cdt.g0, cdt.g1) && hasNext(cdt.offset, len)) {

        while (cdt.offset < len
            && ct.isCombining(data[cdt.offset], cdt.g0, cdt.g1)
            && hasNext(cdt.offset, len)) {
          char c = getCharCDT(data, cdt);
          if (c != 0) {
            diacritics.put(Character.valueOf(c));
          }
          checkMode(data, cdt);
        }
        if (cdt.offset >= len) {
          if (errorList != null) {
            errorList.addError(
                ErrorHandler.MINOR_ERROR,
                "Diacritic found at the end of field, without the character that it is supposed to decorate");
          }
          // Nothing is left to read. Stop in both modes: without an errorList the code used to
          // fall through and index data[] past its end.
          break;
        }
        char c2 = getCharCDT(data, cdt);
        checkMode(data, cdt);
        if (c2 != 0) {
          sb.append(c2);
        }

        // Combining marks follow their base character in the output.
        while (!diacritics.isEmpty()) {
          char c1 = ((Character) diacritics.get()).charValue();
          sb.append(c1);
        }

      } else if (cdt.multibyte) {
        String mbstr = convertMultibyte(cdt, data);
        sb.append(mbstr);
      } else {
        int offset = cdt.offset;
        char cdtchar = data[offset];
        char c = getCharCDT(data, cdt);
        boolean greekErrorFixed = false;
        if (errorList != null && cdt.g0 == 0x53 && data[offset] > 0x20 && data[offset] < 0x40) {
          if (c == 0) {
            errorList.addError(
                ErrorHandler.MINOR_ERROR,
                "Unknown punctuation mark found in Greek character set, inserting change to default character set");
            cdt.g0 = 0x42; // change to default character set
            c = getChar(data[offset], cdt.g0, cdt.g1);
            if (c != 0) {
              sb.append(c);
              greekErrorFixed = true;
            }
          } else if (offset + 1 < data.length
              && data[offset] >= '0'
              && data[offset] <= '9'
              && data[offset + 1] >= '0'
              && data[offset + 1] <= '9') {
            errorList.addError(
                ErrorHandler.MINOR_ERROR,
                "Unlikely sequence of punctuation mark found in Greek character set, it likely a number, inserting change to default character set");
            cdt.g0 = 0x42; // change to default character set
            char c1 = getChar(data[offset], cdt.g0, cdt.g1);
            if (c1 != 0) {
              sb.append(c1);
              greekErrorFixed = true;
            }
          }
        }
        if (!greekErrorFixed) {
          if (c != 0) {
            sb.append(c);
          } else {
            // No mapping: emit a visible placeholder with the 4-digit hex value of the input.
            String val = "0000" + Integer.toHexString((int) (cdtchar));
            sb.append("<U+").append(val.substring(val.length() - 4)).append(">");
          }
        }
      }
      if (hasNext(cdt.offset, len)) {
        checkMode(data, cdt);
      }
    }
    String dataElement = sb.toString();
    if (translateNCR
        && dataElement.matches("[^&]*&#x[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f];.*")) {
      Pattern pattern = Pattern.compile("&#x([0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f]);");
      Matcher matcher = pattern.matcher(dataElement);
      StringBuilder newElement = new StringBuilder();
      int prevEnd = 0;
      while (matcher.find()) {
        newElement.append(dataElement, prevEnd, matcher.start());
        newElement.append(getCharFromCodePoint(matcher.group(1)));
        prevEnd = matcher.end();
      }
      newElement.append(dataElement.substring(prevEnd));
      dataElement = newElement.toString();
    }
    return (dataElement);
  }
예제 #5
0
 /**
  * Consumes any character-set escape sequences that start at {@code cdt.offset} and updates the
  * tracker's active sets, multibyte flag and offset accordingly.
  *
  * <p>Spaces found inside a sequence are skipped by looking further ahead; their total is
  * recorded in {@code errorList} once at the end. An escape character that is last in the data,
  * that is cut off after {@code '$'}, or that is followed by an unrecognised code is discarded on
  * its own ({@code cdt.offset} advances by one). When no {@code errorList} is configured these
  * problems raise a {@link MarcException} instead; a space after {@code '$'} is the exception and
  * is still skipped silently.
  *
  * @param data the MARC-8 input being converted
  * @param cdt decoding state to update
  * @throws MarcException on a malformed escape sequence when no {@code errorList} is configured
  */
 private void checkMode(char[] data, CodeTracker cdt) {
   int extra = 0; // spaces skipped directly after the escape character
   int extra2 = 0; // spaces skipped after the '$' intermediate
   int spacesSeen = 0; // spaces from sequences that have already been fully handled
   while (cdt.offset + extra + extra2 < data.length && isEscape(data[cdt.offset])) {
     if (cdt.offset + extra + extra2 + 1 == data.length) {
       cdt.offset += 1;
       if (errorList != null) {
         errorList.addError(
             ErrorHandler.MINOR_ERROR, "Escape character found at end of field, discarding it.");
       } else {
         throw new MarcException("Escape character found at end of field");
       }
       break;
     }
     final int sequenceStart = cdt.offset;
     switch (data[cdt.offset + 1 + extra]) {
       case 0x28: // '('
       case 0x2c: // ','
         set_cdt(cdt, 0, data, 2 + extra, false);
         break;
       case 0x29: // ')'
       case 0x2d: // '-'
         set_cdt(cdt, 1, data, 2 + extra, false);
         break;
       case 0x24: // '$'
         if (cdt.offset + 2 + extra + extra2 >= data.length) {
           // The data ends right after '$' (and any skipped spaces); reading the next code
           // would index past the array, so discard the escape character instead.
           cdt.offset += 1;
           if (errorList != null) {
             errorList.addError(
                 ErrorHandler.MINOR_ERROR,
                 "Incomplete escape sequence found at end of field, discarding escape character.");
           } else {
             throw new MarcException("Incomplete escape sequence found at end of field");
           }
           break;
         }
         if (!loadedMultibyte) {
           loadMultibyte();
           loadedMultibyte = true;
         }
         switch (data[cdt.offset + 2 + extra + extra2]) {
           case 0x29: // ')'
           case 0x2d: // '-'
             set_cdt(cdt, 1, data, 3 + extra + extra2, true);
             break;
           case 0x2c: // ','
             set_cdt(cdt, 0, data, 3 + extra + extra2, true);
             break;
           case 0x31: // '1'
             cdt.g0 = data[cdt.offset + 2 + extra + extra2];
             cdt.offset += 3 + extra + extra2;
             cdt.multibyte = true;
             break;
           case 0x20: // ' '
             // space found in escape code: look ahead and try to proceed
             extra2++;
             break;
           default:
             // unknown code character found: discard escape sequence and return
             cdt.offset += 1;
             if (errorList != null) {
               errorList.addError(
                   ErrorHandler.MINOR_ERROR,
                   "Unknown character set code found following escape character. Discarding escape character.");
             } else {
               throw new MarcException(
                   "Unknown character set code found following escape character.");
             }
             break;
         }
         break;
       case 0x67: // 'g'
       case 0x62: // 'b'
       case 0x70: // 'p'
         cdt.g0 = data[cdt.offset + 1 + extra];
         cdt.offset += 2 + extra;
         cdt.multibyte = false;
         break;
       case 0x73: // 's'
         cdt.g0 = 0x42;
         cdt.offset += 2 + extra;
         cdt.multibyte = false;
         break;
       case 0x20: // ' '
         // space found in escape code: look ahead and try to proceed
         if (errorList == null) {
           throw new MarcException(
               "Extraneous space character found within MARC8 character set escape sequence");
         }
         extra++;
         break;
       default:
         // unknown code character found: discard escape sequence and return
         cdt.offset += 1;
         if (errorList != null) {
           errorList.addError(
               ErrorHandler.MINOR_ERROR,
               "Unknown character set code found following escape character. Discarding escape character.");
         } else {
           throw new MarcException("Unknown character set code found following escape character.");
         }
         break;
     }
     if (cdt.offset != sequenceStart) {
       // This escape was consumed or discarded. Its skipped spaces must not shift the
       // look-ahead of a following escape sequence, so bank them and start afresh.
       spacesSeen += extra + extra2;
       extra = 0;
       extra2 = 0;
     }
   }
   spacesSeen += extra + extra2;
   if (errorList != null && spacesSeen != 0) {
     errorList.addError(
         ErrorHandler.ERROR_TYPO,
         ""
             + spacesSeen
             + " extraneous space characters found within MARC8 character set escape sequence");
   }
 }