/**
 * Uploads both observation arrays to the device, enqueues one
 * {@code compute_emissions_<width>} kernel for every template width that has at least one
 * template index, and finally copies the contents of {@code d_scores} into {@code scores}.
 *
 * @param scores destination array that receives the values read back from {@code d_scores}
 * @param whiteObservations host data written to {@code d_Ow} after being zero-extended
 * @param blackObservations host data written to {@code d_Ob} after being zero-extended
 * @param sequenceLength sequence length; passed to every kernel and used to size the X grid
 */
public void compute(
    final float[] scores,
    final float[] whiteObservations,
    final float[] blackObservations,
    final int sequenceLength) {
  // Round the sequence length up to a whole number of (blockSizeX * rollX) sized chunks.
  final int positionsPerGroup = blockSizeX * rollX;
  final int gridSizeX = (int) Math.ceil(((double) sequenceLength) / positionsPerGroup);
  final int paddedSeqLength = gridSizeX * positionsPerGroup;

  // Both observation arrays are zero-extended to this many floats before the upload.
  final int paddedObservationCount =
      (paddedSeqLength + maxTemplateWidth - 1) * CharacterTemplate.LINE_HEIGHT;

  final float[] paddedWhite = CudaUtil.extendWithZeros(whiteObservations, paddedObservationCount);
  d_Ow.write(queue, pc.capture(Pointer.pointerToFloats(paddedWhite)), false);

  final float[] paddedBlack = CudaUtil.extendWithZeros(blackObservations, paddedObservationCount);
  d_Ob.write(queue, pc.capture(Pointer.pointerToFloats(paddedBlack)), false);

  // The writes above are non-blocking (last argument false); order them before the kernels.
  queue.enqueueBarrier();

  for (int tw = minTemplateWidth; tw <= maxTemplateWidth; ++tw) {
    // Per-width arrays are indexed relative to the smallest template width.
    final int slot = tw - minTemplateWidth;
    if (!(templateNumIndices[slot] > 0)) {
      continue; // nothing to compute for this width
    }

    final int gridSizeY = (int) Math.ceil(((double) templateNumIndices[slot]) / blockSizeY);

    final CLKernel widthKernel = program.createKernel("compute_emissions_" + tw);
    widthKernel.setArgs(
        templateIndicesOffsets[slot] * sequenceLength,
        sequenceLength,
        templateNumIndices[slot],
        d_Tw[slot],
        d_Tb[slot],
        d_Ow,
        d_Ob,
        d_scores);

    final int[] globalSize = {gridSizeX * blockSizeX, gridSizeY * blockSizeY};
    final int[] localSize = {blockSizeX, blockSizeY};
    widthKernel.enqueueNDRange(queue, globalSize, localSize);
  }

  // Order all enqueued kernels before the read-back of the score buffer.
  queue.enqueueBarrier();
  d_scores.read(queue).getFloats(scores);
}
/**
 * Runs the configured OpenCL kernel on {@code vecA} and {@code vecB} and copies the device result
 * into {@code vecC} (presumably an element-wise addition, judging by the method name; the kernel
 * source is not part of this method).
 *
 * <p>All three arrays must have the same length: a single host staging buffer is reused for the
 * two uploads and the download, and the kernel is launched with {@code vecC.length} work items.
 * The device buffers created for the call are released before the method returns.
 *
 * @param clType device type handed to {@code initCL}
 * @param vecC output array; overwritten with the result read back from the device
 * @param vecA first input array
 * @param vecB second input array
 * @return {@code EXIT_SUCCESS} when the kernel ran and the result was copied to {@code vecC};
 *     {@code EXIT_FAILURE} when the array lengths differ or any exception was raised
 */
public static synchronized boolean addVec(
    CLDevice.Type clType, int[] vecC, int[] vecA, int[] vecB) {
  CLBuffer<IntBuffer> aBuffer = null;
  CLBuffer<IntBuffer> bBuffer = null;
  CLBuffer<IntBuffer> cBuffer = null;
  try {
    if ((Logger.getLogMask() & Level.DEFAULT.DEBUG.getLevel().getValue())
        == Level.DEFAULT.DEBUG.getLevel().getValue()) {
      Logger.logDebug(CLAZZ, "addVec() vecA: " + Convert.toString(vecA));
      Logger.logDebug(CLAZZ, "addVec() vecB: " + Convert.toString(vecB));
    }

    // Initialise the OpenCL objects.
    initCL(clType);

    // Print information about the selected OpenCL device.
    Logger.logInfo(CLAZZ, "max compute units: " + devices.get(0).getMaxComputeUnits());
    Logger.logInfo(CLAZZ, "max work group sizes: " + devices.get(0).getMaxWorkGroupSize());
    Logger.logInfo(
        CLAZZ, "max global mem size (KB): " + devices.get(0).getGlobalMemSize() / 1024);
    Logger.logInfo(CLAZZ, "max local mem size (KB): " + devices.get(0).getLocalMemSize() / 1024);

    // The staging buffer is shared by all three vectors, so their lengths have to agree;
    // otherwise stale data would be uploaded or the kernel would index past the valid range.
    if (vecA.length != vecB.length || vecA.length != vecC.length) {
      Logger.logError(
          CLAZZ,
          "addVec() vector lengths differ: vecA="
              + vecA.length
              + ", vecB="
              + vecB.length
              + ", vecC="
              + vecC.length);
      return EXIT_FAILURE;
    }

    // Create and prepare the data. Integer.SIZE is a bit count, so convert it to bytes;
    // using it directly would allocate eight times the required memory.
    final int bytesPerInt = Integer.SIZE / Byte.SIZE;
    IntBuffer tmpBuffer =
        ByteBuffer.allocateDirect(vecA.length * bytesPerInt)
            .order(context.getByteOrder())
            .asIntBuffer();

    tmpBuffer.put(vecA);
    tmpBuffer.rewind(); // hand the buffer over positioned at its first element
    aBuffer = context.createBuffer(CLMem.Usage.Input, tmpBuffer, true);

    tmpBuffer.clear();
    tmpBuffer.put(vecB);
    tmpBuffer.rewind();
    bBuffer = context.createBuffer(CLMem.Usage.Input, tmpBuffer, true);

    cBuffer = context.createBuffer(CLMem.Usage.Output, vecC.length, IntBuffer.class);

    // Set the kernel arguments.
    kernel.setArg(0, cBuffer);
    kernel.setArg(1, aBuffer);
    kernel.setArg(2, bBuffer);
    kernel.setArg(3, vecC.length);

    // Run the kernel and wait for it to complete.
    CLEvent event = kernel.enqueueNDRange(cmdQ, new int[] {vecC.length}, new CLEvent[0]);
    event.waitFor();
    cmdQ.finish();

    // Fetch the result from the OpenCL device (blocking read), then copy it into vecC.
    cBuffer.read(cmdQ, tmpBuffer, true, new CLEvent[0]);
    tmpBuffer.clear();
    tmpBuffer.get(vecC);

    if ((Logger.getLogMask() & Level.DEFAULT.DEBUG.getLevel().getValue())
        == Level.DEFAULT.DEBUG.getLevel().getValue()) {
      Logger.logDebug(CLAZZ, "addVec() vecC: " + Convert.toString(vecC));
    }
  } catch (CLException err) {
    Logger.logError(CLAZZ, "OpenCL error:\n" + err.getMessage() + "():" + err.getCode());
    err.printStackTrace();
    return EXIT_FAILURE;
  } catch (Exception err) {
    Logger.logError(CLAZZ, "Error:\n" + err.getMessage() + "()");
    err.printStackTrace();
    return EXIT_FAILURE;
  } finally {
    // Free the device memory on every path instead of waiting for finalization.
    releaseQuietly(aBuffer);
    releaseQuietly(bBuffer);
    releaseQuietly(cBuffer);
  }
  return EXIT_SUCCESS;
}

/** Releases a device buffer if it was created; a failure is logged rather than propagated. */
private static void releaseQuietly(CLBuffer<IntBuffer> buffer) {
  if (buffer == null) {
    return;
  }
  try {
    buffer.release();
  } catch (CLException err) {
    Logger.logError(CLAZZ, "OpenCL error:\n" + err.getMessage() + "():" + err.getCode());
  }
}