private void processText( String text, int ngramSize, String resultFolder, String sourceFile, String author) { List<List<String>> ngrams = extractor.extract_SN_Grams(text, ngramSize); List<List<Integer>> numericNGrams = FeatureMapper.numericalizeNGrams(ngrams); HashMap<List<Integer>, Integer> profile = FeatureMapper.createProfile(numericNGrams); writeProfile(profile, resultFolder, sourceFile, author); // Store the seen ngrams for (List<Integer> seenNgram : profile.keySet()) { uniqueNgrams.add(seenNgram); } }
/**
 * Computes the mass of a residue sequence with {@code MassCalc.getMass}, using
 * {@code SymbolPropertyTable.AVG_MASS}.
 *
 * <p>Any exception raised by the calculation is logged (including its stack trace) and reported
 * to the caller through the sentinel return value rather than propagated.
 *
 * @param polypeptideMapper feature the residues belong to; only used to name the feature in the
 *     error log
 * @param residuesSymbolList residues whose mass is computed
 * @return the value produced by {@code MassCalc.getMass}, or {@code -1.0} if the calculation
 *     threw
 */
private static double calculateMass(
    FeatureMapper polypeptideMapper, SymbolList residuesSymbolList) {
  try {
    // NOTE(review): the third argument is presumably BioJava's MH_PLUS flag - confirm.
    return MassCalc.getMass(residuesSymbolList, SymbolPropertyTable.AVG_MASS, true);
  } catch (Exception exp) {
    // Pass the exception itself so the stack trace and cause are not lost.
    logger.error(
        String.format(
            "Error computing protein mass in '%s' because '%s'",
            polypeptideMapper.getUniqueName(), exp.getMessage()),
        exp);
    return -1.0;
  }
}
public void processCorpus(String corpusFolder, String resultFolder, int ngramSize) { CorpusManager corpusManager = null; try { corpusManager = new CorpusManager(corpusFolder); } catch (IOException e) { System.err.println("Could not initialize corpus manager: " + e.getMessage()); System.exit(-1); } TextInstance currentTextInstance = corpusManager.getNextText(); int textCount = 1; while (currentTextInstance != null) { String currentText = null; try { currentText = currentTextInstance.getFullText(); } catch (IOException e) { System.err.println( "Could not get text. Skipping " + currentTextInstance.getTextSource() + " Cause: " + e.getMessage()); currentTextInstance = corpusManager.getNextText(); continue; } System.out.print( "Processing text " + textCount + " of " + corpusManager.getTextCount() + " (" + currentTextInstance.getTextSource().getAbsolutePath().toString() + ") ... "); try { processText( currentText, ngramSize, resultFolder, currentTextInstance.getTextSource().getName(), currentTextInstance.getTrueAuthor()); System.out.println("Done."); } catch (Exception e) { System.err.println(" ERROR: " + e.getMessage()); // just continue to next text } currentTextInstance = corpusManager.getNextText(); ++textCount; } // Do the same with the unknown texts // TODO: Fix duplication! int unknownTextCount = 1; TextInstance currentUnknownText = corpusManager.getUnknownText(); while (currentUnknownText != null) { System.out.println( "Processing unknown text " + unknownTextCount + " of " + corpusManager.getUnknownTextCount()); String unknownText = null; try { unknownText = currentUnknownText.getFullText(); } catch (IOException e) { System.err.println( "Could not get unknown text. 
Skipping " + currentUnknownText.getTextSource() + " Cause: " + e.getMessage()); currentUnknownText = corpusManager.getUnknownText(); continue; } try { processText( unknownText, ngramSize, resultFolder, currentUnknownText.getTextSource().getName(), "UNKNOWN"); } catch (Exception e) { System.err.println(" ERROR: " + e.getMessage()); // just continue to next text } currentUnknownText = corpusManager.getUnknownText(); ++unknownTextCount; } ProfileWriter.writeTranslationTable( FeatureMapper.getTranslationTable(), new File(resultFolder)); ProfileWriter.writeSeenUniqueNGrams(uniqueNgrams, new File(resultFolder)); }
/** * Calculate the predicted properties of this polypeptide. * * @return a <code>PeptideProperties</code> object containing the predicted properties of this * polypeptide. */ public static PeptideProperties calculateStats(FeatureMapper polypeptideMapper) { if (polypeptideMapper.getResidues() == null) { logger.warn("No residues for '" + polypeptideMapper.getUniqueName() + "'"); return null; } String residuesString = new String(polypeptideMapper.getResidues()); SymbolList residuesSymbolList = null; PeptideProperties pp = new PeptideProperties(); try { SymbolTokenization proteinTokenization = ProteinTools.getTAlphabet().getTokenization("token"); residuesSymbolList = new SimpleSymbolList(proteinTokenization, residuesString); if (residuesSymbolList.length() == 0) { logger.error( String.format( "Polypeptide feature '%s' has zero-length residues", polypeptideMapper.getUniqueName())); return pp; } try { // if the sequence ends with a termination symbol (*), we need to remove it if (residuesSymbolList.symbolAt(residuesSymbolList.length()) == ProteinTools.ter()) { if (residuesSymbolList.length() == 1) { logger.error( String.format( "Polypeptide feature '%s' only has termination symbol", polypeptideMapper.getUniqueName())); return pp; } residuesSymbolList = residuesSymbolList.subList(1, residuesSymbolList.length() - 1); } } catch (IndexOutOfBoundsException exception) { throw new RuntimeException(exception); } } catch (BioException e) { logger.error("Can't translate into a protein sequence", e); return pp; } pp.setAminoAcids(residuesSymbolList.length()); try { double isoElectricPoint = new IsoelectricPointCalc().getPI(residuesSymbolList, false, false); pp.setIsoelectricPoint(isoElectricPoint); } catch (Exception e) { logger.error( String.format("Error computing protein isoelectric point for '%s'", residuesSymbolList), e); } double mass2 = calculateMass(polypeptideMapper, residuesSymbolList); if (mass2 != -1) { // mass = mass2; pp.setMass(mass2); } double charge = 
calculateCharge(residuesString); pp.setCharge(charge); return pp; }