/** * * 将指定的文件转换格式成发布的语料格式,并进行保存 首先获取文件夹路径,然后构建对应的txt,xml文件夹和对应的文件 * * @param filePath * @throws IOException * @throws DocumentException */ public void transformFile(String filePath) throws IOException, DocumentException { filePath = this.corpusRoot + filePath; // System.out.println(filePath); this.allFileNum++; // 原始语料位置和内容以及标注语料的原始存放路径名 File corpFile = new File(filePath); String rawPath = filePath.substring(0, filePath.lastIndexOf('.')) + ".txt"; String rawText = Toolkit.readFileToString(rawPath).replace("\r\n", "\n"); String fName = corpFile.getName(); String relativeDir = corpFile.getParentFile().getAbsolutePath(); relativeDir = relativeDir.substring(this.corpusRoot.length()); // 分发txt格式语料存放的文件夹和文件名 String disTxtDir = this.disTXTRoot + "\\" + relativeDir; String disTxtName = disTxtDir + "\\" + fName; String disXmlDir = this.disXMLRoot + "\\" + relativeDir; String disXmlName = disXmlDir + "\\" + fName; File txtDirFile = new File(disTxtDir); File txtFile = new File(disTxtName); File xmlDirFile = new File(disXmlDir); File xmlFile = new File(disXmlName); if (!txtDirFile.exists()) txtDirFile.mkdirs(); if (!txtFile.exists()) txtFile.createNewFile(); if (!xmlDirFile.exists()) xmlDirFile.mkdirs(); if (!xmlFile.exists()) xmlFile.createNewFile(); // 构建完成文件夹路径之后,将文件进行转换,获得转换后的结果 Vector<String> annotaLines = new Vector<String>(); Toolkit.readFileToLines(filePath, annotaLines); FileWriter txtWriter = new FileWriter(disTxtName); FileWriter xmlWriter = new FileWriter(disXmlName); String txtResult = null; String xmlResult = null; xmlWriter.write("<?xml version=\"1.0\" encoding=\"gb2312\" ?>\r\n"); xmlWriter.write("<doc>\r\n"); /** 已经处理过的标注行,主要用于在同一个标注文件中删除重复标注行* */ Vector<String> parsedLines = new Vector<String>(); // 将每一个标注行都进行修改 for (String line : annotaLines) { // 判断是否是重复行 if (parsedLines.contains(line)) continue; parsedLines.add(line); TransformOneLine temp = new TransformOneLine(filePath, rawText, line); boolean result = temp.run(); if (!result) continue; txtResult = temp.generateTXT(); xmlResult = temp.generateXML(); if (txtResult.length() > 1) { txtWriter.write(txtResult + "\r\n\r\n"); } if (xmlResult.length() > 1) { xmlWriter.write(xmlResult + "\r\n\r\n"); } } xmlWriter.write("</doc>"); txtWriter.close(); xmlWriter.close(); // 将txt raw语料原样拷贝过去 File srcTxtFile = new File(rawPath); File dstTxtFile = new File(disTxtDir + "\\" + srcTxtFile.getName()); File dstXmlFile = new File(disXmlDir + "\\" + srcTxtFile.getName()); if (!dstTxtFile.exists()) { dstTxtFile.createNewFile(); Toolkit.copyFile(srcTxtFile, dstTxtFile); } if (!dstXmlFile.exists()) { dstXmlFile.createNewFile(); Toolkit.copyFile(srcTxtFile, dstXmlFile); } }