public static void parseArgs(String[] args) throws Exception { int i = 0; String srcDirPath = new String(); // String srcDirPath="C:/Users/ql29/Documents/EClipse/Dataset/swt-3.1"; String dstDirPath = new String(); // String dstDirPath="C:/Users/ql29/Documents/EClipse/sourceCodeCorpus_new"; while (i < args.length - 1) { if (args[i].equals("-i")) { i++; srcDirPath = args[i]; } else if (args[i].equals("-o")) { i++; dstDirPath = args[i]; } else if (args[i].equals("-t")) { i++; if (args[i] != null) { Config.getInstance().setFileType(args[i + 1]); } } else if (args[i].equals("-l")) { i++; if (args[i] != null) { Config.getInstance().setSegmentationLength(Integer.parseInt(args[i])); } } i++; } boolean isLegal = true; if (!new File(srcDirPath).isDirectory()) { isLegal = false; System.out.println("Error--the input directory is illegal!\n"); } if (dstDirPath.equals(new String())) { System.out.println("please assign a directory for the source code corpus!"); isLegal = false; } if (!isLegal) { showHelp(); } else { SourceCodeCorpus corpus = extractCodeData(); Config.getInstance().setPaths(srcDirPath, null, dstDirPath, null); System.out.println("corpus extraction successful!"); exportCodeData(corpus); System.out.println("corpus successfully exported!"); } }
/** * Import the source code corpus from the given directory * * @param srcDirPath * @return * @throws Exception */ public static SourceCodeCorpus importCodeData() throws Exception { String srcDirPath = Config.getInstance().getCodeCorpusDir(); SourceCodeCorpus corpus = new SourceCodeCorpus(); File srcDir = new File(srcDirPath); if (!srcDir.isDirectory()) { System.out.println("The input directory path is invalid!"); return corpus; } // read the basic information from the "/basicInfo" String basicInfoFilePath = Paths.get(srcDirPath, "basicInfo").toString(); if (!new File(basicInfoFilePath).isFile()) { System.out.println("The file recording the basic information is missing!"); } else { BufferedReader basicInfoReader = new BufferedReader(new FileReader(basicInfoFilePath)); // read the first line: fileType=\tfileType String[] strs = basicInfoReader.readLine().split("\t"); String fileType = strs[1].trim(); // read the second line: segmentationLength=\tsegmentationLength strs = basicInfoReader.readLine().split("\t"); int segmentationLength = Integer.parseInt(strs[1].trim()); corpus.setFileType(fileType); corpus.setSegmentationLength(segmentationLength); basicInfoReader.close(); } // read the code content information from the "/codeContentCorpus" String codeContentDirPath = Paths.get(srcDirPath, "codeContentCorpus").toString(); File codeContentDir = new File(codeContentDirPath); if (!codeContentDir.isDirectory()) { System.out.println("The directory for original code contents is missing!"); } else { for (File oneFile : codeContentDir.listFiles()) { if (oneFile.isFile()) { String fullClassName = oneFile.getName(); BufferedReader reader = new BufferedReader(new FileReader(oneFile)); String codeContent = reader.readLine(); // If exists the full class name, set the code content; // else create a new SourceCode object and set the code consent boolean isExist = false; for (SourceCode sourceCode : corpus.getSourceCodeList()) { if (sourceCode.getFullClassName().equals(fullClassName.trim())) { sourceCode.setContent(codeContent); isExist = true; break; } } if (!isExist) { SourceCode newSourceCode = new SourceCode(); newSourceCode.setFullClassName(fullClassName); newSourceCode.setContent(codeContent); corpus.addSourceCode(newSourceCode); } reader.close(); } } } // read the code segment information from code segment corpus String codeSegmentDirPath = Paths.get(srcDirPath, "codeSegmentCorpus").toString(); File codeSegmentDir = new File(codeSegmentDirPath); if (!codeSegmentDir.isDirectory()) { System.out.println("The directory for code segments is missing!"); } else { for (File oneFile : codeContentDir.listFiles()) { if (oneFile.isFile()) { // fullClassName+@+"segment index".java String oneSegmentName = oneFile.getName(); BufferedReader reader = new BufferedReader(new FileReader(oneFile)); String codeSegment = reader.readLine(); String[] strs = oneSegmentName.split("@"); String fullClassName = strs[0].trim(); // If exists the full class name, add the segment to the list; // else create a new SourceCode object and add the segment to the list boolean isExist = false; for (SourceCode sourceCode : corpus.getSourceCodeList()) { if (sourceCode.getFullClassName().equals(fullClassName.trim())) { if (!sourceCode.getCodeSegmentList().contains(codeSegment)) { sourceCode.addCodeSegment(codeSegment); } isExist = true; break; } } if (!isExist) { SourceCode newSourceCode = new SourceCode(); newSourceCode.setFullClassName(fullClassName); newSourceCode.addCodeSegment(codeSegment); corpus.addSourceCode(newSourceCode); } reader.close(); } } } // read the class names information from the "/classNameCorpus" String classNameDirPath = Paths.get(srcDirPath, "classNameCorpus").toString(); File classNameDir = new File(classNameDirPath); if (!classNameDir.isDirectory()) { System.out.println("The directory for class names is missing!"); } else { for (File oneFile : classNameDir.listFiles()) { if (oneFile.isFile()) { String fullClassName = oneFile.getName(); BufferedReader reader = new BufferedReader(new FileReader(oneFile)); String classNamesString = reader.readLine(); // If exists the full class name, add the class names to the list; // else create a new SourceCode object and add all the class names to the list boolean isExist = false; for (SourceCode sourceCode : corpus.getSourceCodeList()) { if (sourceCode.getFullClassName().equals(fullClassName.trim())) { if (classNamesString == null) { isExist = true; break; } for (String oneClassName : classNamesString.split(" ")) { if (!sourceCode.getClassNameList().contains(oneClassName.trim())) { sourceCode.addClassName(oneClassName.trim()); } } isExist = true; break; } } if (!isExist) { SourceCode newSourceCode = new SourceCode(); newSourceCode.setFullClassName(fullClassName); if (classNamesString != null) { for (String oneClassName : classNamesString.split(" ")) { newSourceCode.addClassName(oneClassName.trim()); } } corpus.addSourceCode(newSourceCode); } reader.close(); } } } // read the method names information from the "/methodNameCorpus" String methodNameDirPath = Paths.get(srcDirPath, "methodNameCorpus").toString(); File methodNameDir = new File(methodNameDirPath); if (!methodNameDir.isDirectory()) { System.out.println("The directory for method names is missing!"); } else { for (File oneFile : methodNameDir.listFiles()) { if (oneFile.isFile()) { String fullClassName = oneFile.getName(); BufferedReader reader = new BufferedReader(new FileReader(oneFile)); String methodNamesString = reader.readLine(); // If exists the full class name, add the class names to the list; // else create a new SourceCode object and add all the method names to the list boolean isExist = false; for (SourceCode sourceCode : corpus.getSourceCodeList()) { if (sourceCode.getFullClassName().equals(fullClassName.trim())) { if (methodNamesString == null) { isExist = true; break; } for (String oneMethodName : methodNamesString.split(" ")) { if (!sourceCode.getMethodNameList().contains(oneMethodName.trim())) sourceCode.addMethodName(oneMethodName.trim()); } isExist = true; break; } } if (!isExist) { SourceCode newSourceCode = new SourceCode(); newSourceCode.setFullClassName(fullClassName); if (methodNamesString != null) { for (String oneMethodName : methodNamesString.split(" ")) { newSourceCode.addMethodName(oneMethodName.trim()); } } corpus.addSourceCode(newSourceCode); } reader.close(); } } } return corpus; }
/** * Extract the code data from the project * * @param srcDirPath * @param fileType * @return * @throws Exception */ public static SourceCodeCorpus extractCodeData() throws Exception { String srcDirPath = Config.getInstance().getDatasetDir(); String fileType = Config.getInstance().getFileType(); int segmentationLength = Config.getInstance().getSegmentationLength(); SourceCodeCorpus corpus = new SourceCodeCorpus(segmentationLength, fileType); File srcDir = new File(srcDirPath); if (!srcDir.isDirectory()) { System.out.println("The input directory path is invalid"); return corpus; } ArrayList<String> fileList = new ArrayList<String>(); detectAllFiles(srcDirPath, fileType, fileList); for (String oneFilePath : fileList) { SourceCode oneCodeFile = new SourceCode(); FileParser parser = new FileParser(oneFilePath); // set the full class name(package+fileName) String packageName = parser.getPackageName(); String fullClassName = new String(); if (packageName.trim().equals("")) { // no package, the name only fullClassName = new File(oneFilePath).getName(); } else { // full class name = package name + file name fullClassName = packageName + "." + new File(oneFilePath).getName(); } oneCodeFile.setFullClassName(fullClassName); // set the file content String[] terms = parser.getContent(); String fileContent = new String(); for (String term : terms) { String stemmedTerm = Stem.stem(term.toLowerCase()); // term = term.toLowerCase(); if (!(Stopword.isKeyword(term) || Stopword.isEnglishStopword(term))) { fileContent += stemmedTerm + " "; } } // append the class and method names in a file to the end of a file String[] classAndMethodNameString = parser.getClassNameAndMethodName(); for (String term : classAndMethodNameString) { fileContent += Stem.stem(term.toLowerCase()) + " "; } oneCodeFile.setContent(fileContent); // set the class names in the file String classNamesString = parser.getAllClassName(); String[] classNameArray = classNamesString.split(" "); for (String oneClassName : classNameArray) { oneCodeFile.addClassName(oneClassName); } // set the method names in the file String methodNamesString = parser.getAllMethodName(); String[] methodNameArray = methodNamesString.split(" "); for (String oneMethodName : methodNameArray) { oneCodeFile.addMethodName(oneMethodName); } // add the source code file information to the corpus corpus.addSourceCode(oneCodeFile); } // set the original code file count Config.getInstance().setFileCount(corpus.getSourceCodeList().size()); // segment each source code file corpus.segment(); return corpus; }
/** * Export the code data to the given directory * * @param dstDirPath * @param corpus * @throws IOException */ public static void exportCodeData(SourceCodeCorpus corpus) throws IOException { String dstDirPath = Config.getInstance().getCodeCorpusDir(); // create a directory File dstDir = new File(dstDirPath); if (!dstDir.isDirectory()) { dstDir.mkdir(); } // record the basic information in the "/basicInfo" String basicInfoFilePath = Paths.get(dstDirPath, "basicInfo").toString(); FileWriter basicInfoWriter = new FileWriter(basicInfoFilePath); // fileType + segmentationLength String basicInfoStr = "fileType=" + "\t" + corpus.getFileType() + "\r\n" + "segmentationLength=" + "\t" + corpus.getSegmentationLength(); basicInfoWriter.write(basicInfoStr); basicInfoWriter.close(); // corpus built by original code contents String codeContentDirPath = Paths.get(dstDirPath, "codeContentCorpus").toString(); File codeContentDir = new File(codeContentDirPath); if (!codeContentDir.isDirectory()) { codeContentDir.mkdir(); } // corpus built by code segments String codeSegmentDirPath = Paths.get(dstDirPath, "codeSegmentCorpus").toString(); File codeSegmentDir = new File(codeSegmentDirPath); if (!codeSegmentDir.isDirectory()) { codeSegmentDir.mkdir(); } // corpus built by class name list String classNameDirPath = Paths.get(dstDirPath, "classNameCorpus").toString(); File classNameDir = new File(classNameDirPath); if (!classNameDir.isDirectory()) { classNameDir.mkdir(); } // corpus built by method name list String methodNameDirPath = Paths.get(dstDirPath, "methodNameCorpus").toString(); File methodNameDir = new File(methodNameDirPath); if (!methodNameDir.isDirectory()) { methodNameDir.mkdir(); } // Traverse every file in the source code file list for (SourceCode oneCodeFile : corpus.getSourceCodeList()) { String fileName; FileWriter writer; // For each source code file, the name in the original content corpus is the full class name fileName = Paths.get(codeContentDirPath, oneCodeFile.getFullClassName()).toString(); writer = new FileWriter(fileName); writer.write(oneCodeFile.getContent()); writer.close(); // For each source code file, the name in the code segments corpus is the full class // name+@+"the segment index".java String[] codeSegmentArray = oneCodeFile.getCodeSegmentList().toArray(new String[0]); for (int i = 0; i < codeSegmentArray.length; i++) { fileName = Paths.get(codeSegmentDirPath, oneCodeFile.getFullClassName() + "@" + i + ".java") .toString(); writer = new FileWriter(fileName); writer.write(codeSegmentArray[i]); writer.close(); } // For each source code file, the name in the class Names corpus is the full class name fileName = Paths.get(classNameDirPath, oneCodeFile.getFullClassName()).toString(); writer = new FileWriter(fileName); String classNamesString = new String(); for (String oneClassName : oneCodeFile.getClassNameList()) { classNamesString += oneClassName + " "; } writer.write(classNamesString.trim()); writer.close(); // For each source code file, the name in the method Names corpus is the full class name fileName = Paths.get(methodNameDirPath, oneCodeFile.getFullClassName()).toString(); writer = new FileWriter(fileName); String methodNamesString = new String(); for (String oneMethodName : oneCodeFile.getMethodNameList()) { methodNamesString += oneMethodName + " "; } writer.write(methodNamesString.trim()); writer.close(); } }