@SuppressWarnings("unchecked") public void startup( float[][] whiteTemplates, float[][] blackTemplates, int[] templateNumIndices, int[] templateIndicesOffsets, int minTemplateWidth, int maxTemplateWidth, int maxSequenceLength, int totalTemplateNumIndices) { this.templateNumIndices = templateNumIndices; this.templateIndicesOffsets = templateIndicesOffsets; this.maxTemplateWidth = maxTemplateWidth; this.minTemplateWidth = minTemplateWidth; // Allocate the device input data int extendedMaxSeqLength = (blockSizeX * rollX) * (int) Math.ceil(((double) maxSequenceLength) / (blockSizeX * rollX)); this.d_Ow = context.createFloatBuffer( Usage.Input, (extendedMaxSeqLength + maxTemplateWidth - 1) * CharacterTemplate.LINE_HEIGHT); this.d_Ob = context.createFloatBuffer( Usage.Input, (extendedMaxSeqLength + maxTemplateWidth - 1) * CharacterTemplate.LINE_HEIGHT); this.d_scores = context.createFloatBuffer(Usage.Output, maxSequenceLength * totalTemplateNumIndices); int numTemplateWidths = (maxTemplateWidth - minTemplateWidth) + 1; this.d_Tw = new CLBuffer[numTemplateWidths]; this.d_Tb = new CLBuffer[numTemplateWidths]; for (int tw = minTemplateWidth; tw <= maxTemplateWidth; ++tw) { if (templateNumIndices[tw - minTemplateWidth] > 0) { d_Tw[tw - minTemplateWidth] = context.createFloatBuffer(Usage.Input, whiteTemplates[tw - minTemplateWidth].length); d_Tw[tw - minTemplateWidth].write( queue, pc.capture(Pointer.pointerToFloats(whiteTemplates[tw - minTemplateWidth])), false); d_Tb[tw - minTemplateWidth] = context.createFloatBuffer(Usage.Input, whiteTemplates[tw - minTemplateWidth].length); d_Tb[tw - minTemplateWidth].write( queue, pc.capture(Pointer.pointerToFloats(blackTemplates[tw - minTemplateWidth])), false); } } }
private static void initCL(CLDevice.Type clType) throws Exception { /** * Hole OpenCL-Plattformen z.B. AMD APP, NVIDIA CUDA ** */ platforms = JavaCL.listPlatforms(); /** * Hole OpenCL-Device des geforderten Typs z.B. GPU, CPU ** */ EnumSet<CLDevice.Type> types = EnumSet.of(clType); devices = new ArrayList<CLDevice>(); CLDevice[] devTmp; for (CLPlatform platform : platforms) { devTmp = platform.listDevices(types, true); devices.addAll(Arrays.asList(devTmp)); } /** * Erstelle OpenCL-Context und CommandQueue ** */ devTmp = new CLDevice[devices.size()]; context = JavaCL.createContext(null, devices.toArray(devTmp)); cmdQ = context.createDefaultQueue(QueueProperties.ProfilingEnable); /** * OpenCL-Quellcode einlesen ** */ String src = readFile(KERNEL_PATH); // String src = KERNEL_SRC; /** * OpenCL-Programm aus Quellcode erstellen ** */ program = context.createProgram(src); try { program.build(); } catch (CLBuildException err) { Logger.logError(CLAZZ, "Build log for \"" + devices.get(0) + "\n" + err.getMessage()); throw err; } /** * OpenCL-Kernel laden ** */ kernel = program.createKernel("addVec"); }
/**
 * Runs the {@code addVec} kernel on an OpenCL device of the given type and copies the result
 * into {@code vecC}.
 *
 * <p>The OpenCL state is (re)initialized via {@code initCL} on every call. The kernel is launched
 * with a global size of {@code vecC.length}; input vectors shorter than {@code vecC} are
 * zero-padded on the host so the device buffers always hold at least {@code vecC.length}
 * elements.
 *
 * @param clType the OpenCL device type to run on
 * @param vecC output array; receives {@code vecC.length} result elements
 * @param vecA first input vector
 * @param vecB second input vector
 * @return {@code EXIT_SUCCESS} on success, {@code EXIT_FAILURE} if any exception occurred (the
 *     exception is logged and its stack trace printed)
 */
public static synchronized boolean addVec(
    CLDevice.Type clType, int[] vecC, int[] vecA, int[] vecB) {
  CLBuffer<IntBuffer> aBuffer = null;
  CLBuffer<IntBuffer> bBuffer = null;
  CLBuffer<IntBuffer> cBuffer = null;
  try {
    if (isDebugLoggingEnabled()) {
      Logger.logDebug(CLAZZ, "addVec() vecA: " + Convert.toString(vecA));
      Logger.logDebug(CLAZZ, "addVec() vecB: " + Convert.toString(vecB));
    }

    // Initialize the OpenCL objects.
    initCL(clType);

    // Print information about the selected OpenCL device.
    Logger.logInfo(CLAZZ, "max compute units: " + devices.get(0).getMaxComputeUnits());
    Logger.logInfo(CLAZZ, "max work group sizes: " + devices.get(0).getMaxWorkGroupSize());
    Logger.logInfo(
        CLAZZ, "max global mem size (KB): " + devices.get(0).getGlobalMemSize() / 1024);
    Logger.logInfo(
        CLAZZ, "max local mem size (KB): " + devices.get(0).getLocalMemSize() / 1024);

    // Create and prepare the data. Each vector gets its own correctly sized host buffer
    // (previously a single buffer of length * Integer.SIZE bytes, i.e. 8x too large, was
    // shared by all three vectors).
    aBuffer =
        context.createBuffer(CLMem.Usage.Input, toDirectIntBuffer(vecA, vecC.length), true);
    bBuffer =
        context.createBuffer(CLMem.Usage.Input, toDirectIntBuffer(vecB, vecC.length), true);
    cBuffer = context.createBuffer(CLMem.Usage.Output, vecC.length, IntBuffer.class);

    // Set the kernel arguments.
    kernel.setArg(0, cBuffer);
    kernel.setArg(1, aBuffer);
    kernel.setArg(2, bBuffer);
    kernel.setArg(3, vecC.length);

    // Run the kernel and wait for it to complete.
    CLEvent event = kernel.enqueueNDRange(cmdQ, new int[] {vecC.length}, new CLEvent[0]);
    event.waitFor();
    cmdQ.finish();

    // Fetch the data from the OpenCL device.
    IntBuffer result = allocateDirectInts(vecC.length);
    cBuffer.read(cmdQ, result, true, new CLEvent[0]);
    result.clear();
    result.get(vecC);

    if (isDebugLoggingEnabled()) {
      Logger.logDebug(CLAZZ, "addVec() vecC: " + Convert.toString(vecC));
    }
  } catch (CLException err) {
    Logger.logError(CLAZZ, "OpenCL error:\n" + err.getMessage() + "():" + err.getCode());
    err.printStackTrace();
    return EXIT_FAILURE;
  } catch (Exception err) {
    Logger.logError(CLAZZ, "Error:\n" + err.getMessage() + "()");
    err.printStackTrace();
    return EXIT_FAILURE;
  } finally {
    // Free the device memory explicitly instead of waiting for garbage collection.
    releaseQuietly(aBuffer);
    releaseQuietly(bBuffer);
    releaseQuietly(cBuffer);
  }
  return EXIT_SUCCESS;
}

/** Returns whether the DEBUG bit is set in the logger's current log mask. */
private static boolean isDebugLoggingEnabled() {
  return (Logger.getLogMask() & Level.DEFAULT.DEBUG.getLevel().getValue())
      == Level.DEFAULT.DEBUG.getLevel().getValue();
}

/** Allocates a zero-filled direct int buffer of {@code count} elements in the context's byte order. */
private static IntBuffer allocateDirectInts(int count) {
  // Integer.SIZE is in bits; divide by Byte.SIZE to get the size of one int in bytes.
  return ByteBuffer.allocateDirect(count * (Integer.SIZE / Byte.SIZE))
      .order(context.getByteOrder())
      .asIntBuffer();
}

/**
 * Copies {@code data} into a direct int buffer holding at least {@code minLength} elements
 * (zero-padded) and rewinds it so the contents start at position 0.
 */
private static IntBuffer toDirectIntBuffer(int[] data, int minLength) {
  IntBuffer buffer = allocateDirectInts(Math.max(data.length, minLength));
  buffer.put(data);
  buffer.rewind();
  return buffer;
}

/** Releases a device buffer if it was created; a failure to release is logged, not thrown. */
private static void releaseQuietly(CLBuffer<IntBuffer> buffer) {
  if (buffer == null) {
    return;
  }
  try {
    buffer.release();
  } catch (RuntimeException err) {
    Logger.logError(CLAZZ, "Failed to release OpenCL buffer: " + err.getMessage());
  }
}
public OpenCLInnerLoop(int numThreads) { // choose device List<CLDevice> devices = new ArrayList<CLDevice>(); System.out.println(); for (CLPlatform platform : JavaCL.listPlatforms()) { for (CLDevice device : platform.listAllDevices(true)) { System.out.println("Type: " + device.getType()); System.out.println("Vendor: " + device.getVendor()); System.out.println("Name: " + device.getName()); System.out.println("Compute units: " + device.getMaxComputeUnits()); System.out.println("Global mem: " + device.getGlobalMemSize() / 1e6 + "MB"); System.out.println("Driver version: " + device.getDriverVersion()); System.out.println(); devices.add(device); } } if (context == null) { for (CLDevice device : devices) { if (device.getVendor().toLowerCase().contains("intel") && device.getType().contains(CLDevice.Type.GPU) && device.getMaxComputeUnits() >= 140 && device.getGlobalMemSize() > 512e6) { this.context = JavaCL.createContext(null, device); } } } // if (context == null) { // for (CLDevice device : devices) { // if (device.getVendor().toLowerCase().contains("nvidia") && // device.getType().contains(CLDevice.Type.GPU) && device.getMaxComputeUnits() >= 8 && // device.getGlobalMemSize() > 1e9 && // !device.getPlatform().getName().toLowerCase().contains("apple")) { // this.context = JavaCL.createContext(null, device); // } // } // } if (context == null) { this.context = JavaCL.createBestContext(DeviceFeature.CPU); } if (context.getDevices()[0].getType().contains(CLDevice.Type.GPU) && context.getDevices()[0].getVendor().toLowerCase().contains("nvidia")) { this.blockSizeX = NVIDIA_GPU_BLOCK_SIZE_X; this.rollX = NVIDIA_GPU_ROLL_X; this.blockSizeY = NVIDIA_GPU_BLOCK_SIZE_Y; } else if (context.getDevices()[0].getType().contains(CLDevice.Type.GPU) && context.getDevices()[0].getVendor().toLowerCase().contains("intel")) { this.blockSizeX = INTEL_GPU_BLOCK_SIZE_X; this.rollX = INTEL_GPU_ROLL_X; this.blockSizeY = INTEL_GPU_BLOCK_SIZE_Y; } else if 
(context.getDevices()[0].getType().contains(CLDevice.Type.CPU)) { this.blockSizeX = CPU_BLOCK_SIZE_X; this.rollX = CPU_ROLL_X; this.blockSizeY = CPU_BLOCK_SIZE_Y; } System.out.println("Using context:"); System.out.println(context.toString()); System.out.println("Block size x: " + blockSizeX); System.out.println("Roll x: " + rollX); System.out.println("Block size y: " + blockSizeY); this.context.setCacheBinaries(false); this.queue = context.createDefaultQueue(); this.program = context.createProgram(kernelSrc()); this.program.addBuildOption("-cl-fast-relaxed-math"); this.program.addBuildOption("-cl-mad-enable"); this.program.addBuildOption("-cl-unsafe-math-optimizations"); this.program.addBuildOption("-cl-fast-relaxed-math"); this.program.addBuildOption("-cl-single-precision-constant"); this.program.build(); this.pc = new PointerCapturer(); this.numThreads = numThreads; }