public TmxReader(String tmxContent) throws TmxReadException { // 解析文件 VTDGen vg = new VTDGen(); vg.setDoc(tmxContent.getBytes()); String message = ""; try { vg.parse(true); } catch (EncodingException e) { logger.error(Messages.getString("document.ImportAbstract.logger1"), e); message = Messages.getString("document.ImportAbstract.msg1"); throw new TmxReadException(message + e.getMessage()); } catch (ParseException e) { logger.error(Messages.getString("document.ImportAbstract.logger3"), e); String errMsg = e.getMessage(); if (errMsg.indexOf("invalid encoding") != -1) { // 编码异常 message = Messages.getString("document.ImportAbstract.msg1"); } else { message = Messages.getString("document.ImportAbstract.msg3"); } throw new TmxReadException(message + e.getMessage()); } header = new TmxHeader(); validateTmxAndParseHeader(vg); tuAp = new AutoPilot(vu.getVTDNav()); try { tuAp.selectXPath("./tu"); } catch (XPathParseException e) { throw new TmxReadException(Messages.getString("document.TmxReader.parseTmxFileError")); } }
public TmxReader(File file) throws TmxReadException { // 解析文件 VTDGen vg = null; try { vg = VTDLoader.loadVTDGen(file, FileEncodingDetector.detectFileEncoding(file)); } catch (IOException e) { logger.error(Messages.getString("document.DocUtils.logger1"), e); throw new TmxReadException(Messages.getString("document.TmxReader.parseTmxFileError")); } catch (EncodingException e) { logger.error(Messages.getString("document.ImportAbstract.logger1"), e); String message = Messages.getString("document.ImportAbstract.msg1"); throw new TmxReadException(message + e.getMessage()); } catch (ParseException e) { logger.error(Messages.getString("document.ImportAbstract.logger3"), e); String errMsg = e.getMessage(); String message; if (errMsg.indexOf("invalid encoding") != -1) { // 编码异常 message = Messages.getString("document.ImportAbstract.msg1"); } else { message = Messages.getString("document.ImportAbstract.msg3"); } throw new TmxReadException(message + e.getMessage()); } catch (EmptyFileException e) { logger.error(Messages.getString("document.DocUtils.logger1"), e); throw new TmxReadException(Messages.getString("document.TmxReader.EmptyTmxFileError")); } if (vg == null) { throw new TmxReadException(Messages.getString("document.TmxReader.parseTmxFileError")); } // 验证TMX ,解析Header XMLElement,将节点导航到Body XMLElement header = new TmxHeader(); validateTmxAndParseHeader(vg); tuAp = new AutoPilot(vu.getVTDNav()); try { tuAp.selectXPath("./tu"); } catch (XPathParseException e) { throw new TmxReadException(Messages.getString("document.TmxReader.parseTmxFileError")); } }
/** * Parse file with VTD-XML * * @param file * @return * @throws TmxReadException All Exception come from VTDExcetpion; */ private VTDGen paseFile(File file) throws TmxReadException { String encoding = FileEncodingDetector.detectFileEncoding(file); VTDGen vg = new VTDGen(); FileInputStream fis = null; String message = ""; try { fis = new FileInputStream(file); byte[] bArr = new byte[(int) file.length()]; int offset = 0; int numRead = 0; int numOfBytes = 1048576; // I choose this value randomally, // any other (not too big) value also can be here. if (bArr.length - offset < numOfBytes) { numOfBytes = bArr.length - offset; } while (offset < bArr.length && (numRead = fis.read(bArr, offset, numOfBytes)) >= 0) { offset += numRead; if (bArr.length - offset < numOfBytes) { numOfBytes = bArr.length - offset; } } // clean invalid XML character byte[] _bArr = new byte[bArr.length]; int _bArrIndx = 0; int type = 0; if (encoding.equalsIgnoreCase("UTF-16LE") || encoding.equalsIgnoreCase("UTF-16BE")) { type = 1; } for (int i = 0; i < bArr.length; i++) { byte b = bArr[i]; if ((b >= type && b <= 8) || b == 11 || b == 12 || (b >= 14 && b <= 31)) { continue; } else if (b == 38 && i + 1 < bArr.length && bArr[i + 1] == 35 && i + 2 < bArr.length) { // &# List<Byte> entis = new ArrayList<Byte>(); entis.add((byte) 38); entis.add((byte) 35); int j = i + 2; if (bArr[j] == 120) { // x entis.add((byte) 120); while (true) { j++; if (j >= bArr.length) { entis.clear(); b = bArr[i]; break; } b = bArr[j]; if ((b >= 48 && b <= 57) || (b >= 97 && b <= 102) || (b >= 65 && b <= 70)) { entis.add(b); } else if (b == 59) { entis.add(b); i = j; break; } else if (j - i > 10) { entis.clear(); b = bArr[i]; break; } else { entis.clear(); b = bArr[i]; break; } } } else { while (true) { b = bArr[j]; if ((b >= 48 && b <= 57)) { entis.add(b); } else if (b == 59) { entis.add(b); i = j; break; } else if (j - i > 10) { entis.clear(); b = bArr[i]; break; } else { entis.clear(); b = bArr[i]; break; } j++; if (j >= bArr.length) { entis.clear(); b = bArr[i]; break; } } } if (!entis.isEmpty()) { byte[] t = new byte[entis.size()]; for (int ti = 0; ti < entis.size(); ti++) { t[ti] = entis.get(ti); } String s = new String(t); if (s.matches("((&#[x]?)(([0]?([0-8]|[BbCcEe]))|(1[0-9])|(1[a-fA-F]));)")) { continue; } } } _bArr[_bArrIndx++] = b; } bArr = null; bArr = Arrays.copyOf(_bArr, _bArrIndx); // use vtd parse vg.setDoc(bArr); vg.parse(true); } catch (IOException e) { logger.error(Messages.getString("document.DocUtils.logger1"), e); throw new TmxReadException(Messages.getString("document.TmxReader.parseTmxFileError")); } catch (EncodingException e) { logger.error(Messages.getString("document.ImportAbstract.logger1"), e); message = Messages.getString("document.ImportAbstract.msg1"); throw new TmxReadException(message + e.getMessage()); } catch (ParseException e) { logger.error(Messages.getString("document.ImportAbstract.logger3"), e); String errMsg = e.getMessage(); if (errMsg.indexOf("invalid encoding") != -1) { // 编码异常 message = Messages.getString("document.ImportAbstract.msg1"); } else { message = Messages.getString("document.ImportAbstract.msg3"); } throw new TmxReadException(message + e.getMessage()); } finally { if (fis != null) { try { fis.close(); } catch (Exception e) { } } } return vg; }