private static void extractAndLoadNativeLibs() throws IOException { Path target = Paths.get(ioTmpDir, "/tklib"); if (!target.toFile().exists()) { Files.createDirectories(target); } final boolean windows = System.getProperty("os.name").equalsIgnoreCase("windows"); String fileExtension = windows ? "dll" : "so"; String prefix = windows ? "" : "lib"; String libPattern = fileExtension.equals("dll") ? "-windows" : "-linux" + "-x86"; if (System.getProperty("sun.arch.data.model").equals("64")) { libPattern += "_64"; } libPattern += "." + fileExtension; // System.err.println(libPattern); System.setProperty("java.library.path", target.toString()); // System.err.println(System.getProperty("java.library.path")); extractAndLoadNativeLib(prefix + "JCudaDriver" + libPattern, target); extractAndLoadNativeLib(prefix + "JCudaRuntime" + libPattern, target); extractAndLoadNativeLib(prefix + "JCurand" + libPattern, target); }
private static void extractAndLoadNativeLib(String nativeLibName, Path target) { // System.err.println("loading "+nativeLibName); final Path path = Paths.get(target.toString(), nativeLibName); if (!path.toFile().exists()) { try (InputStream is = CudaEngine.class .getClassLoader() .getResourceAsStream("/lib/" + nativeLibName)) { // TODO TK property for lib dir Files.copy(is, path); } catch (IOException e) { e.printStackTrace(); } catch (NullPointerException e) { // TODO find a way to do it instead of eclipse final Path eclipsePath = FileSystems.getDefault().getPath("lib", nativeLibName); try { Files.copy(eclipsePath, path); } catch (IOException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } } } System.load(path.toString()); // System.load(nativeLibName); }
public class CudaEngine { static final Map<Integer, CudaEngine> cudaEngines = new HashMap<>(); private static final String PHEROMONES_CU = "pheromones"; private static final String ioTmpDir = System.getProperty("java.io.tmpdir"); private static int availableDevicesNb = 0; private static ExecutorService initialization; private static int NB_OF_DEVICE_TO_USE = 1; private static AtomicInteger cudaObjectID = new AtomicInteger(0); final HashMap<Kernel, CUfunction> kernels = new HashMap<Kernel, CUfunction>(); protected CUfunction f; protected CUcontext context; private ExecutorService exe; private List<CudaObject> cudaObjects = new ArrayList<CudaObject>(); private int maxThreads; private int Id = -1; private Map<String, CUdeviceptr> neigborsPtrs; private CudaEngine(final int deviceId) { exe = Executors.newSingleThreadExecutor(); // mandatory: Only one cuda thread per context Id = deviceId; try { exe.submit( new Runnable() { @Override public void run() { CUdevice device = new CUdevice(); JCudaDriver.cuDeviceGet(device, deviceId); int array[] = {0}; JCudaDriver.cuDeviceGetAttribute( array, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device); maxThreads = (int) Math.sqrt(array[0]); context = new CUcontext(); // JCudaDriver.cuCtxCreate(context, CUctx_flags.CU_CTX_SCHED_BLOCKING_SYNC, // device); JCudaDriver.cuCtxCreate(context, 0, device); CUmodule m = new CUmodule(); initModules(m); for (Kernel k : Kernel.values()) { initFunction(m, k); } // JCudaDriver.cuCtxSetCacheConfig(CUfunc_cache.CU_FUNC_CACHE_PREFER_NONE);> // // JCudaDriver.cuCtxSetSharedMemConfig(CUsharedconfig.CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE); } }) .get(); } catch (InterruptedException | ExecutionException e) { throw new RuntimeException(e.getMessage()); } neigborsPtrs = new HashMap<>(); } private static void extractAndLoadNativeLib(String nativeLibName, Path target) { // System.err.println("loading "+nativeLibName); final Path path = Paths.get(target.toString(), nativeLibName); if (!path.toFile().exists()) { try (InputStream is = CudaEngine.class .getClassLoader() .getResourceAsStream("/lib/" + nativeLibName)) { // TODO TK property for lib dir Files.copy(is, path); } catch (IOException e) { e.printStackTrace(); } catch (NullPointerException e) { // TODO find a way to do it instead of eclipse final Path eclipsePath = FileSystems.getDefault().getPath("lib", nativeLibName); try { Files.copy(eclipsePath, path); } catch (IOException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } } } System.load(path.toString()); // System.load(nativeLibName); } private static void extractAndLoadNativeLibs() throws IOException { Path target = Paths.get(ioTmpDir, "/tklib"); if (!target.toFile().exists()) { Files.createDirectories(target); } final boolean windows = System.getProperty("os.name").equalsIgnoreCase("windows"); String fileExtension = windows ? "dll" : "so"; String prefix = windows ? "" : "lib"; String libPattern = fileExtension.equals("dll") ? "-windows" : "-linux" + "-x86"; if (System.getProperty("sun.arch.data.model").equals("64")) { libPattern += "_64"; } libPattern += "." + fileExtension; // System.err.println(libPattern); System.setProperty("java.library.path", target.toString()); // System.err.println(System.getProperty("java.library.path")); extractAndLoadNativeLib(prefix + "JCudaDriver" + libPattern, target); extractAndLoadNativeLib(prefix + "JCudaRuntime" + libPattern, target); extractAndLoadNativeLib(prefix + "JCurand" + libPattern, target); } public static void main(String[] args) { init(); } /** */ public static boolean init() { synchronized (cudaEngines) { System.err.println("---------Initializing Cuda----------------"); try { extractAndLoadNativeLibs(); JCudaDriver.setExceptionsEnabled(true); JCudaDriver.cuInit(0); compileKernelsPtx(); // Obtain the number of devices int deviceCountArray[] = {0}; JCudaDriver.cuDeviceGetCount(deviceCountArray); availableDevicesNb = deviceCountArray[0]; if (availableDevicesNb == 0) return false; availableDevicesNb = NB_OF_DEVICE_TO_USE; // TODO initialization = Executors.newCachedThreadPool(); System.out.println("Found " + availableDevicesNb + " GPU devices"); for (int i = 0 /*-NB_OF_DEVICE_TO_USE*/; i < availableDevicesNb; i++) { final int index = i; Future<?> initJob = initialization.submit( new Runnable() { public void run() { System.err.println("Initializing device n°" + index); cudaEngines.put(index, new CudaEngine(index)); } }); initJob.get(); initialization.shutdown(); } } catch (InterruptedException | ExecutionException | IOException | CudaException | UnsatisfiedLinkError e) { e.printStackTrace(); System.err.println("---------Cannot initialize Cuda !!! ----------------"); return false; } Runtime.getRuntime() .addShutdownHook( new Thread() { @Override public void run() { CudaEngine.stop(); } }); System.out.println("---------Cuda Initialized----------------"); return true; } } public static boolean isCudaAvailable() { return availableDevicesNb != 0; } public static CudaEngine getCudaEngine(CudaObject co) { synchronized (cudaEngines) { if (!isCudaAvailable()) throw new CudaException("No cuda device found"); try { initialization.awaitTermination(100, TimeUnit.SECONDS); } catch (InterruptedException e) { e.printStackTrace(); } Pheromone p = (Pheromone) co; final int pheroID = cudaObjectID.incrementAndGet(); final CudaEngine ce = cudaEngines.get(pheroID % availableDevicesNb); // final CudaEngine ce = cudaEngines.get(1); // final CudaEngine ce = cudaEngines.get(0); // final CudaEngine ce; // if(p.getName().contains("PRE")){ // ce = cudaEngines.get(0); // } // else{ // ce = cudaEngines.get(1); // } // ce.cudaObjects.add(co); System.err.println(co + "ID " + pheroID + " getting cuda engine Id " + ce.Id); return ce; } } static FloatBuffer getUnifiedFloatBuffer(Pointer pinnedMemory, CUdeviceptr devicePtr, long size) { JCudaDriver.cuMemHostAlloc(pinnedMemory, size, JCudaDriver.CU_MEMHOSTALLOC_DEVICEMAP); final ByteBuffer byteBuffer = pinnedMemory.getByteBuffer(0, size); byteBuffer.order(ByteOrder.nativeOrder()); JCudaDriver.cuMemHostGetDevicePointer(devicePtr, pinnedMemory, 0); return byteBuffer.asFloatBuffer(); } public static IntBuffer getUnifiedIntBuffer( Pointer pinnedMemory, CUdeviceptr devicePtr, int size) { JCudaDriver.cuMemHostAlloc(pinnedMemory, size, JCudaDriver.CU_MEMHOSTALLOC_DEVICEMAP); final ByteBuffer byteBuffer = pinnedMemory.getByteBuffer(0, size); byteBuffer.order(ByteOrder.nativeOrder()); JCudaDriver.cuMemHostGetDevicePointer(devicePtr, pinnedMemory, 0); return byteBuffer.asIntBuffer(); } public static int[] getUnifiedIntArray(Pointer pinnedMemory, CUdeviceptr devicePtr, int size) { int[] values = new int[size]; JCudaDriver.cuMemHostAlloc(pinnedMemory, size, JCudaDriver.CU_MEMHOSTALLOC_DEVICEMAP); final ByteBuffer byteBuffer = pinnedMemory.getByteBuffer(0, size); byteBuffer.order(ByteOrder.nativeOrder()); JCudaDriver.cuMemHostGetDevicePointer(devicePtr, pinnedMemory, 0); return values; } public static ByteBuffer getUnifiedByteBuffer( Pointer pinnedMemory, CUdeviceptr devicePtr, int size) { JCudaDriver.cuMemHostAlloc(pinnedMemory, size, JCudaDriver.CU_MEMHOSTALLOC_DEVICEMAP); final ByteBuffer byteBuffer = pinnedMemory.getByteBuffer(0, size); byteBuffer.order(ByteOrder.nativeOrder()); JCudaDriver.cuMemHostGetDevicePointer(devicePtr, pinnedMemory, 0); return byteBuffer; } /** Stop the executors and clean memory on registered CUObject */ public static void stop() { synchronized (cudaEngines) { cuCtxSynchronizeAll(); for (Iterator<CudaEngine> iterator = cudaEngines.values().iterator(); iterator.hasNext(); ) { iterator.next().shutdown(); iterator.remove(); } // for (CudaEngine ce : cudaEngines.values()) { // ce.shutdown(); // } } } /** Stop the executors and clean memory on registered CUObject */ public static synchronized void freeMemory() { for (CudaEngine ce : cudaEngines.values()) { ce.freeCUObjectsMemory(); } } static void initModules(CUmodule module) { JCudaDriver.cuModuleLoad(module, new File(ioTmpDir, PHEROMONES_CU + ".ptx").getAbsolutePath()); } // void compileKernelsPtx() static void compileKernelsPtx() throws IOException { if (!new File(ioTmpDir, PHEROMONES_CU + ".ptx").exists()) { // TODO externalize try (InputStream is = CudaEngine.class.getResourceAsStream( "/turtlekit/cuda/kernels/" + PHEROMONES_CU + ".cu")) { final Path path = Paths.get(ioTmpDir, PHEROMONES_CU + ".cu"); try { Files.copy(is, path); } catch (FileAlreadyExistsException e) { } System.err.println("--------------- Compiling ptx ----------------------"); KernelLauncher.create( path.toString(), Kernel.DIFFUSION_TO_TMP.name(), false, "--use_fast_math", "--prec-div=false"); // ,"--gpu-architecture=sm_20"); } catch (IOException e) { throw e; } } } public static synchronized void cuCtxSynchronizeAll() { for (CudaEngine ce : cudaEngines.values()) { ce.cuCtxSynchronize(); } // List<Future<Void>> futures = new ArrayList<Future<Void>>(); // for (CudaEngine ce : executors) { // futures.add(ce.exe.submit(new Callable<Void>() { // @Override // public Void call() throws Exception { // JCudaDriver.cuCtxSynchronize(); // return null; // } // // })); // } // for (Future<Void> future : futures) { // try { // future.get(); // } catch (InterruptedException | ExecutionException e) { // // TODO Auto-generated catch block // e.printStackTrace(); // } // } } public int cuDeviceGetCount() { return availableDevicesNb; } /** Free memory from the currently registered CUObjects */ public void freeCUObjectsMemory() { exe.submit( new Runnable() { @Override public void run() { cuCtxSynchronize(); for (CudaObject co : cudaObjects) { co.freeMemory(); } JCudaDriver.cuCtxDestroy(context); } }); } private synchronized void shutdown() { if (!exe.isShutdown()) { freeCUObjectsMemory(); } exe.shutdown(); try { System.err.println( "cuda device " + Id + " freed ? " + exe.awaitTermination(10, TimeUnit.SECONDS)); } catch (InterruptedException e) { e.printStackTrace(); } } private void initFunction(CUmodule module, Kernel name) { CUfunction function = new CUfunction(); JCudaDriver.cuModuleGetFunction(function, module, name.name()); kernels.put(name, function); } public int getMaxThreads() { return maxThreads; } public CUfunction getKernelFunction(Kernel f) { CUfunction function = kernels.get(f); if (function == null) throw new CudaException("No such function " + f); return function; } // public static void main(String[] args) { // CudaPheromone cu, cu2; // getCudaEngine(cu = new CudaPheromone(10, 10, 0.1f, 0f, "test")); // getCudaEngine(cu2 = new CudaPheromone(100, 100, 0.3f, 0.5f, "test2")); // cu.set(3, 3, 8); //// cu.diffusion(); // cu.evaporation(); // System.err.println(cu.get(3, 3)); // System.err.println(cu.get(0, 0)); // System.err.println(cu.get(3, 2)); // System.err.println("maxdir " + cu.getMaxDir(3, 2)); // cu.diffusion(); // System.err.println(cu.get(3, 3)); // cu.diffusion(); // System.err.println(cu.get(3, 3)); // cu2.diffusion(); // cu.freeMemory(); // cu2.freeMemory(); // CudaEngine.stop(); // } public void cuCtxSynchronize() { try { exe.submit( new Callable<Void>() { @Override public Void call() throws Exception { JCudaDriver.cuCtxSynchronize(); return null; } }) .get(); } catch (InterruptedException | ExecutionException e) { e.printStackTrace(); } } public Future<?> submit(Runnable runnable) { if (!exe.isShutdown()) { return exe.submit(runnable); } return null; } public CUdeviceptr getNeighborsPtr(String string) { return neigborsPtrs.get(string); } public void addNeighborsPtr(String string, CUdeviceptr neighborsPtr) { neigborsPtrs.put(string, neighborsPtr); } enum Kernel { DIFFUSION_TO_TMP, DIFFUSION_UPDATE, EVAPORATION, FIELD_MAX_DIR, DIFFUSION_UPDATE_THEN_EVAPORATION, DIFFUSION_UPDATE_THEN_EVAPORATION_THEN_FIELDMAXDIR // ,FILL_NEIGHBORS , TEST, DIFFUSION_UPDATE_THEN_EVAPORATION_THEN_FIELDMAXDIRV2 } }