My code section:
// EarlyStopping
EarlyStoppingConfiguration<MultiLayerNetwork> esConf = new EarlyStoppingConfiguration.Builder<MultiLayerNetwork>()
.epochTerminationConditions(new MaxEpochsTerminationCondition(nEpochs), new ScoreImprovementEpochTerminationCondition(earlyStoppingEpochCount)) //
.scoreCalculator(new DataSetLossCalculator(validateDataSetIterator, true)) //
.evaluateEveryNEpochs(1) // 每个epoch都进行评估
// .modelSaver(modelSaver)
.build();
EarlyStoppingResult result = null;
if (mutilGPU) {
EarlyStoppingParallelTrainer trainer = new EarlyStoppingParallelTrainer(esConf, model, trainIterator, null, workersMutilGPU, prefetchBufferMutilGPU, avgFrequencyMutilGPU);
result = trainer.fit();
} else {
EarlyStoppingTrainer trainer = new EarlyStoppingTrainer(esConf, model, trainIterator);
result = trainer.fit();
}
//Print out the results:
System.out.println("Termination reason: " + result.getTerminationReason());
System.out.println("Termination details: " + result.getTerminationDetails());
System.out.println("Total epochs: " + result.getTotalEpochs());
System.out.println("Best epoch number: " + result.getBestModelEpoch());
System.out.println("Score at best epoch: " + result.getBestModelScore());
//Print score vs. epoch
Map<Integer, Double> scoreVsEpoch = result.getScoreVsEpoch();
List<Integer> list = new ArrayList<>(scoreVsEpoch.keySet());
Collections.sort(list);
System.out.println("Score vs. Epoch:");
for (Integer i : list) {
System.out.println("No." + i + " Epoch,Score=" + scoreVsEpoch.get(i));
}
MultiLayerNetwork bestModel = (MultiLayerNetwork) result.getBestModel();
String modelId = getNeuronsStr() + "-" + LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMddHHmmss"));
this.saveModel(bestModel, modelSaveFileName, modelId);
The exceptions (console output and stack traces):
UIServer url:http://localhost:9001/
Termination reason: Error
Termination details: java.lang.RuntimeException: java.lang.RuntimeException: java.lang.RuntimeException: Op [adam_updater] execution failed
Total epochs: 1
Best epoch number: 0
Score at best epoch: 0.2750830228317576
Score vs. Epoch:
No.-1 Epoch,Score=0.2750830228317576
save best model,model fileName=/home/cqdl/AIFocusStocks/myData/LbcFE-SZSE-SECOND-ADAP-20240720-91-0.9-30-Tanh-1.0-29f-SQW-2.0-3264_6432-20240822112736.mdl
11:27:36.872 [ParallelWrapper training thread 3] ERROR org.deeplearning4j.parallelism.ParallelWrapper - Uncaught exception: java.lang.RuntimeException: [DEVICE] deallocation failed; Error code: [700]
11:27:36.823 [ParallelWrapper training thread 0] ERROR org.deeplearning4j.parallelism.ParallelWrapper - Uncaught exception: java.lang.RuntimeException: java.lang.RuntimeException: Op [adam_updater] execution failed
java.lang.RuntimeException: [DEVICE] deallocation failed; Error code: [700]
11:27:36.920 [ParallelWrapper training thread 1] ERROR org.deeplearning4j.parallelism.ParallelWrapper - Uncaught exception: java.lang.RuntimeException: [DEVICE] deallocation failed; Error code: [700]
at org.nd4j.linalg.jcublas.bindings.Nd4jCuda.dbClose(Native Method)
at org.nd4j.nativeblas.OpaqueDataBuffer.closeBuffer(OpaqueDataBuffer.java:219)
at org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer.release(BaseCudaDataBuffer.java:1814)
at org.nd4j.linalg.api.buffer.BaseDataBuffer.close(BaseDataBuffer.java:1946)
at org.nd4j.linalg.api.ndarray.BaseNDArray.close(BaseNDArray.java:5654)
at org.deeplearning4j.nn.multilayer.MultiLayerNetwork.close(MultiLayerNetwork.java:4148)
at org.deeplearning4j.parallelism.trainer.DefaultTrainer.run(DefaultTrainer.java:452)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
at org.deeplearning4j.parallelism.ParallelWrapper$2$1.run(ParallelWrapper.java:156)
at java.base/java.lang.Thread.run(Thread.java:833)
Exception in thread "DeallocatorServiceThread_3" Exception in thread "DeallocatorServiceThread_7" java.lang.RuntimeException: [DEVICE] deallocation failed; Error code: [700]
at org.nd4j.linalg.jcublas.bindings.Nd4jCuda.dbClose(Native Method)
at org.nd4j.nativeblas.OpaqueDataBuffer.closeBuffer(OpaqueDataBuffer.java:219)
at org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer.release(BaseCudaDataBuffer.java:1814)
at org.nd4j.linalg.api.buffer.BaseDataBuffer.close(BaseDataBuffer.java:1946)
at org.nd4j.linalg.api.ndarray.BaseNDArray.close(BaseNDArray.java:5654)
at org.deeplearning4j.nn.multilayer.MultiLayerNetwork.close(MultiLayerNetwork.java:4148)
at org.deeplearning4j.parallelism.trainer.DefaultTrainer.run(DefaultTrainer.java:452)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
at org.deeplearning4j.parallelism.ParallelWrapper$2$1.run(ParallelWrapper.java:156)
at java.base/java.lang.Thread.run(Thread.java:833)
Exception in thread "DeallocatorServiceThread_5" Exception in thread "DeallocatorServiceThread_2" java.lang.RuntimeException: java.lang.RuntimeException: [DEVICE] deallocation failed; Error code: [700]
at org.nd4j.linalg.api.memory.deallocation.DeallocatorService$DeallocatorServiceThread.run(DeallocatorService.java:151)
Caused by: java.lang.RuntimeException: [DEVICE] deallocation failed; Error code: [700]
at org.nd4j.linalg.jcublas.bindings.Nd4jCuda.deleteDataBuffer(Native Method)
at org.nd4j.jita.allocator.impl.CudaDeallocator.deallocate(CudaDeallocator.java:40)
at org.nd4j.linalg.api.memory.deallocation.DeallocatorService$DeallocatorServiceThread.run(DeallocatorService.java:146)
Exception in thread "DeallocatorServiceThread_0" java.lang.RuntimeException: java.lang.RuntimeException: [DEVICE] deallocation failed; Error code: [700]
at org.nd4j.linalg.api.memory.deallocation.DeallocatorService$DeallocatorServiceThread.run(DeallocatorService.java:151)
Caused by: java.lang.RuntimeException: [DEVICE] deallocation failed; Error code: [700]
at org.nd4j.linalg.jcublas.bindings.Nd4jCuda.deleteDataBuffer(Native Method)
at org.nd4j.jita.allocator.impl.CudaDeallocator.deallocate(CudaDeallocator.java:40)
at org.nd4j.linalg.api.memory.deallocation.DeallocatorService$DeallocatorServiceThread.run(DeallocatorService.java:146)
Exception in thread "DeallocatorServiceThread_6" Exception in thread "DeallocatorServiceThread_1" java.lang.RuntimeException: java.lang.RuntimeException: [DEVICE] deallocation failed; Error code: [700]
at org.nd4j.linalg.api.memory.deallocation.DeallocatorService$DeallocatorServiceThread.run(DeallocatorService.java:151)
Caused by: java.lang.RuntimeException: [DEVICE] deallocation failed; Error code: [700]
at org.nd4j.linalg.jcublas.bindings.Nd4jCuda.deleteDataBuffer(Native Method)
at org.nd4j.jita.allocator.impl.CudaDeallocator.deallocate(CudaDeallocator.java:40)
at org.nd4j.linalg.api.memory.deallocation.DeallocatorService$DeallocatorServiceThread.run(DeallocatorService.java:146)
Exception in thread "DeallocatorServiceThread_4" java.lang.RuntimeException: java.lang.RuntimeException: [DEVICE] deallocation failed; Error code: [700]
at org.nd4j.linalg.api.memory.deallocation.DeallocatorService$DeallocatorServiceThread.run(DeallocatorService.java:151)
Caused by: java.lang.RuntimeException: [DEVICE] deallocation failed; Error code: [700]
at org.nd4j.linalg.jcublas.bindings.Nd4jCuda.deleteDataBuffer(Native Method)
at org.nd4j.jita.allocator.impl.CudaDeallocator.deallocate(CudaDeallocator.java:40)
at org.nd4j.linalg.api.memory.deallocation.DeallocatorService$DeallocatorServiceThread.run(DeallocatorService.java:146)
java.lang.RuntimeException: java.lang.RuntimeException: Op [adam_updater] execution failed
at org.deeplearning4j.parallelism.trainer.DefaultTrainer.run(DefaultTrainer.java:446)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
at org.deeplearning4j.parallelism.ParallelWrapper$2$1.run(ParallelWrapper.java:156)
at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: java.lang.RuntimeException: Op [adam_updater] execution failed
at org.nd4j.linalg.jcublas.ops.executioner.CudaExecutioner.exec(CudaExecutioner.java:1881)
at org.nd4j.linalg.factory.Nd4j.exec(Nd4j.java:6545)
at org.nd4j.linalg.learning.AdamUpdater.applyUpdater(AdamUpdater.java:110)
at org.deeplearning4j.nn.updater.UpdaterBlock.update(UpdaterBlock.java:162)
at org.deeplearning4j.nn.updater.UpdaterBlock.updateExternalGradient(UpdaterBlock.java:128)
at org.deeplearning4j.nn.updater.BaseMultiLayerUpdater.update(BaseMultiLayerUpdater.java:320)
at org.deeplearning4j.nn.updater.BaseMultiLayerUpdater.update(BaseMultiLayerUpdater.java:247)
at org.deeplearning4j.optimize.solvers.BaseOptimizer.updateGradientAccordingToParams(BaseOptimizer.java:309)
at org.deeplearning4j.optimize.solvers.BaseOptimizer.gradientAndScore(BaseOptimizer.java:186)
at org.deeplearning4j.optimize.solvers.StochasticGradientDescent.optimize(StochasticGradientDescent.java:61)
at org.deeplearning4j.optimize.Solver.optimize(Solver.java:52)
at org.deeplearning4j.nn.multilayer.MultiLayerNetwork.fitHelper(MultiLayerNetwork.java:2357)
at org.deeplearning4j.nn.multilayer.MultiLayerNetwork.fit(MultiLayerNetwork.java:2315)
at org.deeplearning4j.nn.multilayer.MultiLayerNetwork.fit(MultiLayerNetwork.java:2378)
at org.deeplearning4j.parallelism.trainer.DefaultTrainer.fit(DefaultTrainer.java:233)
at org.deeplearning4j.parallelism.trainer.DefaultTrainer.run(DefaultTrainer.java:382)
... 4 more
Caused by: java.lang.RuntimeException: adamUpdater: cuda stream synchronization failed !; Error code: [700]
at org.nd4j.linalg.jcublas.ops.executioner.CudaExecutioner.exec(CudaExecutioner.java:2067)
at org.nd4j.linalg.jcublas.ops.executioner.CudaExecutioner.exec(CudaExecutioner.java:1870)
... 19 more
java.lang.RuntimeException: java.lang.RuntimeException: [DEVICE] deallocation failed; Error code: [700]
at org.nd4j.linalg.api.memory.deallocation.DeallocatorService$DeallocatorServiceThread.run(DeallocatorService.java:151)
Caused by: java.lang.RuntimeException: [DEVICE] deallocation failed; Error code: [700]
at org.nd4j.linalg.jcublas.bindings.Nd4jCuda.deleteDataBuffer(Native Method)
at org.nd4j.jita.allocator.impl.CudaDeallocator.deallocate(CudaDeallocator.java:40)
at org.nd4j.linalg.api.memory.deallocation.DeallocatorService$DeallocatorServiceThread.run(DeallocatorService.java:146)
java.lang.RuntimeException: java.lang.RuntimeException: [DEVICE] deallocation failed; Error code: [700]
at org.nd4j.linalg.api.memory.deallocation.DeallocatorService$DeallocatorServiceThread.run(DeallocatorService.java:151)
Caused by: java.lang.RuntimeException: [DEVICE] deallocation failed; Error code: [700]
at org.nd4j.linalg.jcublas.bindings.Nd4jCuda.deleteDataBuffer(Native Method)
at org.nd4j.jita.allocator.impl.CudaDeallocator.deallocate(CudaDeallocator.java:40)
at org.nd4j.linalg.api.memory.deallocation.DeallocatorService$DeallocatorServiceThread.run(DeallocatorService.java:146)
java.lang.RuntimeException: java.lang.RuntimeException: [DEVICE] deallocation failed; Error code: [700]
at org.nd4j.linalg.api.memory.deallocation.DeallocatorService$DeallocatorServiceThread.run(DeallocatorService.java:151)
Caused by: java.lang.RuntimeException: [DEVICE] deallocation failed; Error code: [700]
at org.nd4j.linalg.jcublas.bindings.Nd4jCuda.deleteDataBuffer(Native Method)
at org.nd4j.jita.allocator.impl.CudaDeallocator.deallocate(CudaDeallocator.java:40)
at org.nd4j.linalg.api.memory.deallocation.DeallocatorService$DeallocatorServiceThread.run(DeallocatorService.java:146)
java.lang.RuntimeException: java.lang.RuntimeException: [DEVICE] deallocation failed; Error code: [700]
at org.nd4j.linalg.api.memory.deallocation.DeallocatorService$DeallocatorServiceThread.run(DeallocatorService.java:151)
Caused by: java.lang.RuntimeException: [DEVICE] deallocation failed; Error code: [700]
at org.nd4j.linalg.jcublas.bindings.Nd4jCuda.deleteDataBuffer(Native Method)
at org.nd4j.jita.allocator.impl.CudaDeallocator.deallocate(CudaDeallocator.java:40)
at org.nd4j.linalg.api.memory.deallocation.DeallocatorService$DeallocatorServiceThread.run(DeallocatorService.java:146)
Aug 22, 2024 11:27:36 AM com.cq.aifocusstocks.train.RnnPredictModel saveModel
SEVERE: null
java.lang.RuntimeException: DataBuffer::syncToPrimary failed to to some previous kernel failre; Error code: [700]
at org.nd4j.linalg.jcublas.bindings.Nd4jCuda.dbSyncToPrimary(Native Method)
at org.nd4j.jita.allocator.impl.AtomicAllocator.synchronizeHostData(AtomicAllocator.java:361)
at org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer.write(BaseCudaDataBuffer.java:1319)
at org.nd4j.linalg.factory.Nd4j.write(Nd4j.java:2629)
at org.deeplearning4j.util.ModelSerializer.writeModel(ModelSerializer.java:158)
at org.deeplearning4j.util.ModelSerializer.writeModel(ModelSerializer.java:121)
at org.deeplearning4j.util.ModelSerializer.writeModel(ModelSerializer.java:108)
at com.cq.aifocusstocks.train.RnnPredictModel.saveModel(RnnPredictModel.java:262)
at com.cq.aifocusstocks.train.RnnPredictModel.train(RnnPredictModel.java:217)
at com.cq.aifocusstocks.train.CnnLstmRegPredictor.trainModel(CnnLstmRegPredictor.java:245)
at com.cq.aifocusstocks.train.TrainCnnLstmModel.main(TrainCnnLstmModel.java:15)