Dl4j cuda 11.2 running out of memory on evaluation on ubuntu 20.04

here is my pom.xml

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>SampleNd4j</groupId>
  <artifactId>SampleNd4j</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <!-- Build: sources are read from the "src" directory and compiled for Java release 11. -->
  <build>
    <sourceDirectory>src</sourceDirectory>
    <plugins>
      <plugin>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.8.1</version>
        <configuration>
          <release>11</release>
        </configuration>
      </plugin>
    </plugins>
  </build>
  <!-- Properties: within this POM only dl4j.version, logback.version and jackson.version are
       referenced by the dependencies below. The backend, cuda.redist, ffmpeg and javacv
       properties are not referenced anywhere in this file; in particular the CUDA 10.2 values
       here do not affect the dependencies, which name the 11.2 artifacts directly. -->
  <properties>
  		<nd4j.cpu.backend>nd4j-native-platform</nd4j.cpu.backend>
  		<nd4j.cpu.backend2>nd4j-native</nd4j.cpu.backend2>
        <nd4j.gpu.backend>nd4j-cuda-10.2-platform</nd4j.gpu.backend>
        <dl4j.gpu.backend>deeplearning4j-cuda-10.2</dl4j.gpu.backend>
        <cuda.redist.10.1.version>10.1-7.6-1.5.2</cuda.redist.10.1.version>
        <cuda.redist.10.2.version>10.2-7.6-1.5.3</cuda.redist.10.2.version>
        <dl4j.version>1.0.0-SNAPSHOT</dl4j.version>
        <ffmpeg.version>3.2.1-1.3</ffmpeg.version>
        <javacv.version>1.4.1</javacv.version>
        <logback.version>1.1.7</logback.version> 
        <jackson.version>2.9.6</jackson.version>
    </properties>
    <!-- Snapshot repository: needed because dl4j.version is a -SNAPSHOT version; release
         artifacts are not resolved from this repository. -->
    <repositories>
    <repository>
        <id>snapshots-repo</id>
        <url>https://oss.sonatype.org/content/repositories/snapshots</url>
        <releases>
            <enabled>false</enabled>
        </releases>
        <snapshots>
            <enabled>true</enabled>
            <updatePolicy>daily</updatePolicy>  <!-- Optional, update daily -->
        </snapshots>
    </repository>
</repositories>
<dependencies>
  	<dependency>
         <groupId>org.bytedeco</groupId>
         <artifactId>opencv-platform</artifactId>
         <version>4.5.1-1.5.5</version>
     </dependency>
     
     <!-- Jackson dependencies -->
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-core</artifactId>
            <version>${jackson.version}</version>
        </dependency>
		<dependency>
          <groupId>com.fasterxml.jackson.core</groupId>
		  <artifactId>jackson-databind</artifactId>
		  <version>${jackson.version}</version>
        </dependency>
     
     <!-- Log dependency -->
		<dependency>
		    <groupId>org.slf4j</groupId>
		    <artifactId>slf4j-api</artifactId>
		    <version>1.7.25</version>
		</dependency>                                                       
        <dependency>                                                            
            <groupId>ch.qos.logback</groupId>                                   
            <artifactId>logback-classic</artifactId>                            
            <version>${logback.version}</version>                               
        </dependency>
       
       
       <!-- deeplearning4j-core: contains main functionality and neural networks -->
        <dependency>
            <groupId>org.deeplearning4j</groupId>
            <artifactId>deeplearning4j-core</artifactId>
            <version>${dl4j.version}</version>
        </dependency>
        <!-- NOTE(review): both a CPU backend (nd4j-native) and a CUDA backend (nd4j-cuda-11.2)
             are declared; confirm that having both on the classpath is intended. -->
        <dependency>
            <groupId>org.nd4j</groupId>
            <artifactId>nd4j-native</artifactId>
            <version>${dl4j.version}</version>            
        </dependency>
        <!-- CUDA 11.2 artifacts, named directly rather than via the *.gpu.backend properties. -->
        <dependency>
            <groupId>org.deeplearning4j</groupId>
            <artifactId>deeplearning4j-cuda-11.2</artifactId>
            <version>${dl4j.version}</version>
        </dependency>
        <dependency>
            <groupId>org.nd4j</groupId>
            <artifactId>nd4j-cuda-11.2</artifactId>
            <version>${dl4j.version}</version>
        </dependency> 
        
        <!-- ParallelWrapper & ParallelInference live here -->
        <dependency>
            <groupId>org.deeplearning4j</groupId>
            <artifactId>deeplearning4j-parallel-wrapper</artifactId>
            <version>${dl4j.version}</version>
        </dependency> 
        <dependency>
            <groupId>org.datavec</groupId>
            <artifactId>datavec-data-image</artifactId>
            <version>${dl4j.version}</version>
        </dependency> 
        <dependency>
            <groupId>org.datavec</groupId>
            <artifactId>datavec-local</artifactId>
            <version>${dl4j.version}</version>
        </dependency>     
     <dependency>
            <groupId>org.deeplearning4j</groupId>
            <artifactId>deeplearning4j-zoo</artifactId>
            <version>${dl4j.version}</version>
        </dependency>
  </dependencies>
</project>

and here is my test project

package com.sample.ui;

import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Random;
import java.util.concurrent.TimeUnit;

import org.datavec.api.io.filters.BalancedPathFilter;
import org.datavec.api.io.labels.ParentPathLabelGenerator;
import org.datavec.api.split.FileSplit;
import org.datavec.api.split.InputSplit;
import org.datavec.image.loader.BaseImageLoader;
import org.datavec.image.recordreader.ImageRecordReader;
import org.deeplearning4j.datasets.datavec.RecordReaderDataSetIterator;
import org.deeplearning4j.earlystopping.EarlyStoppingConfiguration;
import org.deeplearning4j.earlystopping.EarlyStoppingResult;
import org.deeplearning4j.earlystopping.saver.LocalFileGraphSaver;
import org.deeplearning4j.earlystopping.scorecalc.DataSetLossCalculator;
import org.deeplearning4j.earlystopping.termination.MaxEpochsTerminationCondition;
import org.deeplearning4j.earlystopping.termination.MaxTimeIterationTerminationCondition;
import org.deeplearning4j.earlystopping.termination.ScoreImprovementEpochTerminationCondition;
import org.deeplearning4j.earlystopping.trainer.EarlyStoppingGraphTrainer;
import org.deeplearning4j.earlystopping.trainer.IEarlyStoppingTrainer;
import org.deeplearning4j.nn.api.OptimizationAlgorithm;
import org.deeplearning4j.nn.conf.WorkspaceMode;
import org.deeplearning4j.nn.conf.distribution.NormalDistribution;
import org.deeplearning4j.nn.conf.layers.OutputLayer;
import org.deeplearning4j.nn.graph.ComputationGraph;
import org.deeplearning4j.nn.transferlearning.FineTuneConfiguration;
import org.deeplearning4j.nn.transferlearning.TransferLearning;
import org.deeplearning4j.optimize.listeners.ScoreIterationListener;
import org.deeplearning4j.util.ModelSerializer;
import org.deeplearning4j.zoo.PretrainedType;
import org.deeplearning4j.zoo.ZooModel;
import org.deeplearning4j.zoo.model.VGG16;
import org.nd4j.evaluation.classification.Evaluation;
import org.nd4j.linalg.activations.Activation;
import org.nd4j.linalg.dataset.api.iterator.DataSetIterator;
import org.nd4j.linalg.dataset.api.preprocessor.VGG16ImagePreProcessor;
import org.nd4j.linalg.learning.config.Nesterovs;
import org.nd4j.linalg.lossfunctions.LossFunctions;
import org.slf4j.Logger;

/**
 * Fine-tunes a pretrained VGG16 network on an image dataset using transfer learning and
 * early stopping, then evaluates and saves the best model.
 *
 * <p>Images are read from {@code <user.home>/dataset/sample-data}; the label of each image is
 * derived from its parent directory via {@link ParentPathLabelGenerator}. The early-stopping
 * checkpoints and the final {@code model.zip} are written to {@code <user.home>/dataset}.
 */
public class TrainProgram {

    /** Seed shared by the file shuffling RNG and the fine-tune configuration. */
    public static final long seed = 1234;
    public static final Random RAND_NUM_GEN = new Random(seed);
    public static final String[] ALLOWED_FORMATS = BaseImageLoader.ALLOWED_FORMATS;
    public static ParentPathLabelGenerator LABEL_GENERATOR_MAKER = new ParentPathLabelGenerator();
    public static BalancedPathFilter PATH_FILTER = new BalancedPathFilter(RAND_NUM_GEN, ALLOWED_FORMATS, LABEL_GENERATOR_MAKER);

    protected static final Logger LOGGER = org.slf4j.LoggerFactory.getLogger(TrainProgram.class);
    /** Weight of the training split; the test split receives {@code 100 - TRAIN_SIZE}. */
    protected static final int TRAIN_SIZE = 85;
    /** Minibatch size used by both the training and the test iterator. */
    protected static final int BATCH_SIZE = 32;
    /** Maximum number of epochs before early stopping terminates training. */
    protected static final int EPOCH = 30;

    /**
     * Runs the full pipeline: split the dataset, build the transfer-learning graph, train with
     * early stopping, evaluate the best model and write it to disk.
     *
     * @param args unused
     * @throws IOException if the dataset cannot be read or the model cannot be written
     */
    public static void main(String[] args) throws IOException {
        String homePath = System.getProperty("user.home");
        LOGGER.info(homePath);

        Path datasetPath = Paths.get(homePath, "dataset");
        Path trainPath = Paths.get(datasetPath.toString(), "sample-data");

        // Split the image files into a training part and a test part, weighted TRAIN_SIZE : rest.
        File parentDir = trainPath.toFile();
        FileSplit filesInDir = new FileSplit(parentDir, ALLOWED_FORMATS, RAND_NUM_GEN);
        InputSplit[] filesInDirSplit = filesInDir.sample(PATH_FILTER, TRAIN_SIZE, 100 - TRAIN_SIZE);

        DataSetIterator trainIter = makeIterator(filesInDirSplit[0]);
        DataSetIterator testIter = makeIterator(filesInDirSplit[1]);

        ZooModel objZooModel = VGG16.builder().workspaceMode(WorkspaceMode.ENABLED).build();
        ComputationGraph preTrainedNet = (ComputationGraph) objZooModel.initPretrained(PretrainedType.IMAGENET);
        LOGGER.info(preTrainedNet.summary());

        FineTuneConfiguration fineTuneConf = new FineTuneConfiguration.Builder()
            .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT)
            .updater(new Nesterovs(5e-5))
            .seed(seed)
            .inferenceWorkspaceMode(WorkspaceMode.ENABLED)
            .trainingWorkspaceMode(WorkspaceMode.ENABLED)
            .build();

        // Everything up to and including "fc2" is frozen; only the replaced output layer trains.
        String freezeUntilLayer = "fc2";
        String outputLayer = "predictions";
        int inputLayerParam = 4096; // number of inputs fed to the new output layer
        int numClasses = trainIter.getLabels().size();

        ComputationGraph vgg16Transfer = new TransferLearning.GraphBuilder(preTrainedNet)
            .fineTuneConfiguration(fineTuneConf)
            .setFeatureExtractor(freezeUntilLayer)
            .removeVertexKeepConnections(outputLayer)
            .setWorkspaceMode(WorkspaceMode.ENABLED)
            .addLayer(outputLayer,
                new OutputLayer.Builder(LossFunctions.LossFunction.NEGATIVELOGLIKELIHOOD)
                .nIn(inputLayerParam).nOut(numClasses)
                .weightInit(new NormalDistribution(0, 0.2 * (2.0 / (inputLayerParam + numClasses)))) //This weight init dist gave better results than Xavier
                .activation(Activation.SOFTMAX).build(), freezeUntilLayer)
            .build();
        vgg16Transfer.setListeners(new ScoreIterationListener(5));
        LOGGER.info(vgg16Transfer.summary());

        // Stop after EPOCH epochs, after 5 epochs without score improvement, or after 24 hours.
        // The test iterator is scored after every epoch and checkpoints go to the dataset folder.
        EarlyStoppingConfiguration<ComputationGraph> esConfig = new EarlyStoppingConfiguration.Builder<ComputationGraph>()
            .epochTerminationConditions(new MaxEpochsTerminationCondition(EPOCH),
                new ScoreImprovementEpochTerminationCondition(5, 0))
            .iterationTerminationConditions(
                new MaxTimeIterationTerminationCondition(24, TimeUnit.HOURS))
            .scoreCalculator(new DataSetLossCalculator(testIter, true))
            .evaluateEveryNEpochs(1)
            .modelSaver(new LocalFileGraphSaver(datasetPath.toString()))
            .build();

        IEarlyStoppingTrainer<ComputationGraph> trainer = new EarlyStoppingGraphTrainer(esConfig, vgg16Transfer, trainIter);
        EarlyStoppingResult<ComputationGraph> result = trainer.fit();

        //Print out the results:
        LOGGER.info("Termination reason: {}", result.getTerminationReason());
        LOGGER.info("Termination details: {}", result.getTerminationDetails());
        LOGGER.info("Total epochs: {}", result.getTotalEpochs());
        LOGGER.info("Best epoch number: {}", result.getBestModelEpoch());
        LOGGER.info("Score at best epoch: {}", result.getBestModelScore());

        ComputationGraph bestModel = result.getBestModel();
        // A failed evaluation is logged by evalOn; the model is still written below.
        evalOn(bestModel, testIter, 0);

        Path fullPath = Paths.get(datasetPath.toString(), "model.zip");
        ModelSerializer.writeModel(bestModel, fullPath.toFile(), false);

        LOGGER.info("END");
    }

    /**
     * Builds a minibatch iterator over the images of the given split, resized to 224x224 with
     * 3 channels and preprocessed with {@link VGG16ImagePreProcessor}.
     *
     * @param split the files to iterate over
     * @return an iterator producing batches of {@link #BATCH_SIZE} examples
     * @throws IOException if the record reader cannot be initialized on the split
     */
    public static DataSetIterator makeIterator(InputSplit split) throws IOException {
        int channels = 3;
        int width = 224;
        int height = 224;

        ImageRecordReader recordReader = new ImageRecordReader(height, width, channels, LABEL_GENERATOR_MAKER);
        recordReader.initialize(split);

        // NOTE(review): the trailing arguments (1, 1, true) are passed through unchanged from the
        // original code; confirm they select the intended label column and label mode.
        DataSetIterator iter = new RecordReaderDataSetIterator(recordReader, BATCH_SIZE, 1, 1, true);
        iter.setPreProcessor(new VGG16ImagePreProcessor());

        return iter;
    }

    /**
     * Evaluates the graph on the test iterator, logs the evaluation statistics and resets the
     * iterator afterwards.
     *
     * <p>An out-of-memory failure is treated as a recoverable evaluation failure: it is logged
     * and {@code false} is returned. This covers both a bare {@link OutOfMemoryError} and one
     * that the network has wrapped in a {@link RuntimeException}. Any other runtime exception
     * is rethrown unchanged.
     *
     * @param graph the network to evaluate
     * @param testIterator the data to evaluate on
     * @param iEpoch epoch/iteration number, used only in the log message
     * @return {@code true} if evaluation completed, {@code false} if it ran out of memory
     * @throws IOException declared for callers; not thrown by the current implementation
     */
    public static boolean evalOn(ComputationGraph graph, DataSetIterator testIterator, int iEpoch) throws IOException {
        boolean result = true;
        try {
            LOGGER.info("Evaluate model at iteration {} ....", iEpoch);
            Evaluation eval = graph.evaluate(testIterator);
            LOGGER.info(eval.stats());
            testIterator.reset();
        } catch (OutOfMemoryError | RuntimeException e) {
            if (!isCausedByOutOfMemory(e)) {
                throw e;
            }
            System.gc();
            LOGGER.error("Error: ", e);
            result = false;
        }

        return result;
    }

    /** Returns whether the throwable or any throwable in its cause chain is an OutOfMemoryError. */
    private static boolean isCausedByOutOfMemory(Throwable throwable) {
        for (Throwable cause = throwable; cause != null; cause = cause.getCause()) {
            if (cause instanceof OutOfMemoryError) {
                return true;
            }
        }
        return false;
    }
}

Each time I run this code I run out of memory. I am using RTX 3060 with 12GB RAM and here are my memory settings: -Xms2G -Xmx4G -Dorg.bytedeco.javacpp.maxbytes=6G
Any guidance on how to fix this will be greatly appreciated

Can you also share the memory info file that is written when that happens?

here is the memory crash dump

Deeplearning4j OOM Exception Encountered for ComputationGraph
Timestamp:                              2021-05-10 09:57:23.347
Thread ID                               1
Thread Name                             main


Stack Trace:
java.lang.OutOfMemoryError: Failed to allocate memory within limits: totalBytes (7074M + 9792M) > maxBytes (7168M)
	at org.bytedeco.javacpp.Pointer.deallocator(Pointer.java:696)
	at org.deeplearning4j.cuda.BaseCudnnHelper$DataCache.<init>(BaseCudnnHelper.java:129)
	at org.deeplearning4j.cuda.convolution.CudnnConvolutionHelper.preOutput(CudnnConvolutionHelper.java:544)
	at org.deeplearning4j.nn.layers.convolution.ConvolutionLayer.preOutput(ConvolutionLayer.java:425)
	at org.deeplearning4j.nn.layers.convolution.ConvolutionLayer.activate(ConvolutionLayer.java:522)
	at org.deeplearning4j.nn.layers.FrozenLayer.activate(FrozenLayer.java:82)
	at org.deeplearning4j.nn.graph.vertex.impl.LayerVertex.doForward(LayerVertex.java:110)
	at org.deeplearning4j.nn.graph.ComputationGraph.ffToLayerActivationsDetached(ComputationGraph.java:1974)
	at org.deeplearning4j.nn.graph.ComputationGraph.scoreHelper(ComputationGraph.java:3113)
	at org.deeplearning4j.nn.graph.ComputationGraph.score(ComputationGraph.java:3080)
	at org.deeplearning4j.nn.graph.ComputationGraph.score(ComputationGraph.java:3068)
	at org.deeplearning4j.earlystopping.scorecalc.DataSetLossCalculator.scoreMinibatch(DataSetLossCalculator.java:95)
	at org.deeplearning4j.earlystopping.scorecalc.base.BaseScoreCalculator.scoreMinibatch(BaseScoreCalculator.java:89)
	at org.deeplearning4j.earlystopping.scorecalc.base.BaseScoreCalculator.calculateScore(BaseScoreCalculator.java:59)
	at org.deeplearning4j.earlystopping.trainer.BaseEarlyStoppingTrainer.fit(BaseEarlyStoppingTrainer.java:243)
	at org.deeplearning4j.earlystopping.trainer.BaseEarlyStoppingTrainer.fit(BaseEarlyStoppingTrainer.java:94)
	at com.sample.ui.TrainProgram.main(TrainProgram.java:116)


========== Memory Information ==========
----- Version Information -----
Deeplearning4j Version                  <could not determine>
Deeplearning4j CUDA                     <not present>

----- System Information -----
Operating System                        GNU/Linux Ubuntu 20.04.2 LTS
CPU                                     Intel(R) Xeon(R) CPU E5-2670 v3 @ 2.30GHz
CPU Cores - Physical                    12
CPU Cores - Logical                     24
Total System Memory                      31,28 GiB (33591095296)
Number of GPUs Detected                 1
  Name                           CC                Total Memory              Used Memory              Free Memory
  GeForce RTX 3060               8.6    11,77 GiB (12636192768)    8,76 GiB (9405267968)    3,01 GiB (3230924800)

----- ND4J Environment Information -----
Data Type                               FLOAT
blas.vendor                             CUBLAS
os                                      Linux
backend                                 CUDA

----- Memory Configuration -----
JVM Memory: XMX                           5,00 GiB (5368709120)
JVM Memory: current                       2,00 GiB (2147483648)
JavaCPP Memory: Max Bytes                 7,00 GiB (7516192768)
JavaCPP Memory: Max Physical             12,00 GiB (12884901888)
JavaCPP Memory: Current Bytes             6,91 GiB (7417627664)
JavaCPP Memory: Current Physical         11,30 GiB (12134617088)
Periodic GC Enabled                     false

----- Workspace Information -----
Workspaces: # for current thread        4
Current thread workspaces:
  Name                      State       Size                          # Cycles            
  WS_LAYER_WORKING_MEM      CLOSED      522,24 KiB (534773)           9505                
  WS_ALL_LAYERS_ACT         CLOSED        1,89 GiB (2027569555)       936                 
  WS_LAYER_ACT_0            CLOSED      392,00 MiB (411041792)        807                 
  WS_LAYER_ACT_1            CLOSED      399,84 MiB (419262627)        495                 
Workspaces total size                     2,66 GiB (2858408747)
Helper Workspaces
  CUDNN_WORKSPACE                         884,25 MiB (927203328)

----- Network Information -----
Network # Parameters                    134272835
Parameter Memory                        512,21 MiB (537091340)
Parameter Gradients Memory              512,21 MiB (537091340)
Updater Number of Elements              12291
Updater Memory                           48,01 KiB (49164)
Updater Classes:
  org.nd4j.linalg.learning.NesterovsUpdater
  org.nd4j.linalg.learning.NoOpUpdater
Params + Gradient + Updater Memory      512,26 MiB (537140504)
Iteration Count                         312
Epoch Count                             3
Backprop Type                           Standard
Workspace Mode: Training                ENABLED
Workspace Mode: Inference               ENABLED
Number of Layers                        21
Layer Counts
  FrozenLayer                             20
  OutputLayer                             1
Layer Parameter Breakdown
  Idx Name                 Layer Type           Layer # Parameters   Layer Parameter Memory
  1   block1_conv1         FrozenLayer          1792                   7,00 KiB (7168)   
  2   block1_conv2         FrozenLayer          36928                144,25 KiB (147712) 
  3   block1_pool          FrozenLayer          0                         ,00 B          
  4   block2_conv1         FrozenLayer          73856                288,50 KiB (295424) 
  5   block2_conv2         FrozenLayer          147584               576,50 KiB (590336) 
  6   block2_pool          FrozenLayer          0                         ,00 B          
  7   block3_conv1         FrozenLayer          295168                 1,13 MiB (1180672)
  8   block3_conv2         FrozenLayer          590080                 2,25 MiB (2360320)
  9   block3_conv3         FrozenLayer          590080                 2,25 MiB (2360320)
  10  block3_pool          FrozenLayer          0                         ,00 B          
  11  block4_conv1         FrozenLayer          1180160                4,50 MiB (4720640)
  12  block4_conv2         FrozenLayer          2359808                9,00 MiB (9439232)
  13  block4_conv3         FrozenLayer          2359808                9,00 MiB (9439232)
  14  block4_pool          FrozenLayer          0                         ,00 B          
  15  block5_conv1         FrozenLayer          2359808                9,00 MiB (9439232)
  16  block5_conv2         FrozenLayer          2359808                9,00 MiB (9439232)
  17  block5_conv3         FrozenLayer          2359808                9,00 MiB (9439232)
  18  block5_pool          FrozenLayer          0                         ,00 B          
  20  fc1                  FrozenLayer          102764544            392,02 MiB (411058176)
  21  fc2                  FrozenLayer          16781312              64,02 MiB (67125248)
  22  predictions          OutputLayer          12291                 48,01 KiB (49164)  

----- Layer Helpers - Memory Use -----
Total Helper Count                      18
Helper Count w/ Memory                  0
Total Helper Persistent Memory Use           ,00 B

----- Network Activations: Inferred Activation Shapes -----
Current Minibatch Size                  32
Current Input Shape (Input 0)           [32, 3, 224, 224]
Idx Name                 Layer Type           Activations Type                           Activations Shape    # Elements   Memory      
0   input_1              InputVertex          InputTypeConvolutional(h=224,w=224,c=3,NCHW) [32, 3, 224, 224]    4816896       18,38 MiB (19267584)
1   block1_conv1         FrozenLayer          InputTypeConvolutional(h=224,w=224,c=64,NCHW) [32, 64, 224, 224]   102760448    392,00 MiB (411041792)
2   block1_conv2         FrozenLayer          InputTypeConvolutional(h=224,w=224,c=64,NCHW) [32, 64, 224, 224]   102760448    392,00 MiB (411041792)
3   block1_pool          FrozenLayer          InputTypeConvolutional(h=112,w=112,c=64,NCHW) [32, 64, 112, 112]   25690112      98,00 MiB (102760448)
4   block2_conv1         FrozenLayer          InputTypeConvolutional(h=112,w=112,c=128,NCHW) [32, 128, 112, 112]  51380224     196,00 MiB (205520896)
5   block2_conv2         FrozenLayer          InputTypeConvolutional(h=112,w=112,c=128,NCHW) [32, 128, 112, 112]  51380224     196,00 MiB (205520896)
6   block2_pool          FrozenLayer          InputTypeConvolutional(h=56,w=56,c=128,NCHW) [32, 128, 56, 56]    12845056      49,00 MiB (51380224)
7   block3_conv1         FrozenLayer          InputTypeConvolutional(h=56,w=56,c=256,NCHW) [32, 256, 56, 56]    25690112      98,00 MiB (102760448)
8   block3_conv2         FrozenLayer          InputTypeConvolutional(h=56,w=56,c=256,NCHW) [32, 256, 56, 56]    25690112      98,00 MiB (102760448)
9   block3_conv3         FrozenLayer          InputTypeConvolutional(h=56,w=56,c=256,NCHW) [32, 256, 56, 56]    25690112      98,00 MiB (102760448)
10  block3_pool          FrozenLayer          InputTypeConvolutional(h=28,w=28,c=256,NCHW) [32, 256, 28, 28]    6422528       24,50 MiB (25690112)
11  block4_conv1         FrozenLayer          InputTypeConvolutional(h=28,w=28,c=512,NCHW) [32, 512, 28, 28]    12845056      49,00 MiB (51380224)
12  block4_conv2         FrozenLayer          InputTypeConvolutional(h=28,w=28,c=512,NCHW) [32, 512, 28, 28]    12845056      49,00 MiB (51380224)
13  block4_conv3         FrozenLayer          InputTypeConvolutional(h=28,w=28,c=512,NCHW) [32, 512, 28, 28]    12845056      49,00 MiB (51380224)
14  block4_pool          FrozenLayer          InputTypeConvolutional(h=14,w=14,c=512,NCHW) [32, 512, 14, 14]    3211264       12,25 MiB (12845056)
15  block5_conv1         FrozenLayer          InputTypeConvolutional(h=14,w=14,c=512,NCHW) [32, 512, 14, 14]    3211264       12,25 MiB (12845056)
16  block5_conv2         FrozenLayer          InputTypeConvolutional(h=14,w=14,c=512,NCHW) [32, 512, 14, 14]    3211264       12,25 MiB (12845056)
17  block5_conv3         FrozenLayer          InputTypeConvolutional(h=14,w=14,c=512,NCHW) [32, 512, 14, 14]    3211264       12,25 MiB (12845056)
18  block5_pool          FrozenLayer          InputTypeConvolutional(h=7,w=7,c=512,NCHW) [32, 512, 7, 7]      802816         3,06 MiB (3211264)
19  flatten              PreprocessorVertex   InputTypeFeedForward(25088)                [32, 25088]          802816         3,06 MiB (3211264)
20  fc1                  FrozenLayer          InputTypeFeedForward(4096)                 [32, 4096]           131072       512,00 KiB (524288)
21  fc2                  FrozenLayer          InputTypeFeedForward(4096)                 [32, 4096]           131072       512,00 KiB (524288)
22  predictions          OutputLayer          InputTypeFeedForward(3)                    [32, 3]              96             384,00 B  
Total Activations Memory                  1,82 GiB (1953497472)
Total Activation Gradient Memory          1,82 GiB (1953497088)

----- Network Training Listeners -----
Number of Listeners                     1
Listener 0                              ScoreIterationListener(5)

@ajmakoni your pom.xml says cuda 10.2. Are you sure you’re using 11.2 and snapshots?

Hi @agibsonccc,

Yes, we are using cuda 11.2

Text from log file:

2021-05-10 18:53:38 [main] INFO org.nd4j.linalg.factory.Nd4jBackend - Loaded [JCublasBackend] backend
2021-05-10 18:53:38 [main] ERROR o.n.common.config.ND4JClassLoading - Cannot find class [org.nd4j.linalg.jblas.JblasBackend] of provided class-loader.
2021-05-10 18:53:38 [main] ERROR o.n.common.config.ND4JClassLoading - Cannot find class [org.canova.api.io.data.DoubleWritable] of provided class-loader.
2021-05-10 18:53:38 [main] ERROR o.n.common.config.ND4JClassLoading - Cannot find class [org.nd4j.linalg.jblas.JblasBackend] of provided class-loader.
2021-05-10 18:53:38 [main] ERROR o.n.common.config.ND4JClassLoading - Cannot find class [org.canova.api.io.data.DoubleWritable] of provided class-loader.
2021-05-10 18:53:47 [main] INFO org.nd4j.nativeblas.NativeOpsHolder - Number of threads used for linear algebra: 32
2021-05-10 18:53:47 [main] INFO o.n.l.a.o.e.DefaultOpExecutioner - Backend used: [CUDA]; OS: [Linux]
2021-05-10 18:53:47 [main] INFO o.n.l.a.o.e.DefaultOpExecutioner - Cores: [24]; Memory: [5,0GB];
2021-05-10 18:53:47 [main] INFO o.n.l.a.o.e.DefaultOpExecutioner - Blas vendor: [CUBLAS]
2021-05-10 18:53:47 [main] INFO o.n.linalg.jcublas.JCublasBackend - ND4J CUDA build version: 11.2.152
2021-05-10 18:53:47 [main] INFO o.n.linalg.jcublas.JCublasBackend - CUDA device 0: [GeForce RTX 3060]; cc: [8.6]; Total memory: [12636192768]
2021-05-10 18:53:47 [main] INFO o.n.linalg.jcublas.JCublasBackend - Backend build information:
GCC: “7.5.0”
STD version: 201402L
CUDA: 11.2.152
DEFAULT_ENGINE: samediff::ENGINE_CUDA
HAVE_FLATBUFFERS
2021-05-10 18:54:45 [main] INFO o.d.nn.graph.ComputationGraph - Starting ComputationGraph with WorkspaceModes set to [training: NONE; inference: SINGLE], cacheMode set to [NONE]

Thanks for your comment.

Ok great. Sorry it wasn’t clear from the post there at the top.
I just looked at your pom.xml again.

A work around is usually to just call System.gc(); manually.
Sometimes if you allocate a lot of big buffers up front, the JVM won’t have time to collect the off heap memory.
Did you try both with and without cudnn?

From the output of the memory log it also looks like the periodic gc is disabled.

Given that your log also says that you’ve disabled workspaces for training, that looks like a bad combination.

I suggest you enable the periodic gc. See https://deeplearning4j.konduit.ai/config/config-memory/config-workspaces#garbage-collector

Thanks @treo and @agibsonccc , we will follow your recommendations.

Hey @treo and @agibsonccc, we enabled the workspaces and the periodic gc, and here are our memory settings:
-Xms2G -Xmx9G -Dorg.bytedeco.javacpp.maxbytes=11G -XX:+UseG1GC -Dorg.bytedeco.javacpp.maxphysicalbytes=12G -verbose:gc
this is our GC setting, as per the recommendation from the earlier response

// Turn on the ND4J memory manager's periodic GC, as recommended earlier in this thread.
Nd4j.getMemoryManager().togglePeriodicGc(true);
            // Set the auto-GC window to 5000 (presumably milliseconds; confirm against the ND4J MemoryManager docs).
            Nd4j.getMemoryManager().setAutoGcWindow(5000);

and this is our custom class that we created for this test:

var xx = new IterationTerminationCondition() {
    // Counts terminate() calls: starts at 0 and restarts at 1 after each forced collection.
    int counter = 0;

    /** No setup is required for this condition. */
    @Override
    public void initialize() {
    }

    /**
     * Never requests termination. Used purely as a per-iteration hook that forces a
     * {@code System.gc()} whenever the call counter has reached 5.
     *
     * @param lastMiniBatchScore score of the last minibatch (ignored)
     * @return always {@code false}
     */
    @Override
    public boolean terminate(double lastMiniBatchScore) {
        final boolean gcDue = (counter == 5);
        if (gcDue) {
            LOGGER.info("GC start.");
            System.gc();
            LOGGER.info("GC END.");
        }
        // After a collection the counter restarts at 1 (reset to 0, then counted); otherwise it advances.
        counter = gcDue ? 1 : counter + 1;
        return false;
    }
};

However, our training fails at evaluation of first epoch. It seems that periodic gc collection is failing during epoch evaluation:

Evaluation eval = graph.evaluate(testIterator);

Here is our crash report

Exception in thread "main" java.lang.RuntimeException: Error during neural network forward pass
	at org.deeplearning4j.nn.graph.ComputationGraph.outputOfLayersDetached(ComputationGraph.java:2526)
	at org.deeplearning4j.nn.graph.ComputationGraph.doEvaluationHelper(ComputationGraph.java:4234)
	at org.deeplearning4j.nn.graph.ComputationGraph.doEvaluationHelper(ComputationGraph.java:4189)
	at org.deeplearning4j.nn.graph.ComputationGraph.doEvaluation(ComputationGraph.java:4147)
	at org.deeplearning4j.nn.graph.ComputationGraph.doEvaluation(ComputationGraph.java:4134)
	at org.deeplearning4j.nn.graph.ComputationGraph.evaluate(ComputationGraph.java:3979)
	at org.deeplearning4j.nn.graph.ComputationGraph.evaluate(ComputationGraph.java:3947)
	at org.deeplearning4j.nn.graph.ComputationGraph.evaluate(ComputationGraph.java:3926)
	at com.sample.ui.TrainProgram.evalOn(TrainProgram.java:201)
	at com.sample.ui.TrainProgram.main(TrainProgram.java:175)
Caused by: java.lang.OutOfMemoryError: Failed to allocate memory within limits: totalBytes (7080M + 7074M) > maxBytes (11264M)
	at org.bytedeco.javacpp.Pointer.deallocator(Pointer.java:696)
	at org.deeplearning4j.cuda.BaseCudnnHelper$DataCache.<init>(BaseCudnnHelper.java:129)
	at org.deeplearning4j.cuda.convolution.CudnnConvolutionHelper.preOutput(CudnnConvolutionHelper.java:544)
	at org.deeplearning4j.nn.layers.convolution.ConvolutionLayer.preOutput(ConvolutionLayer.java:425)
	at org.deeplearning4j.nn.layers.convolution.ConvolutionLayer.activate(ConvolutionLayer.java:522)
	at org.deeplearning4j.nn.layers.FrozenLayer.activate(FrozenLayer.java:82)
	at org.deeplearning4j.nn.graph.vertex.impl.LayerVertex.doForward(LayerVertex.java:110)
	at org.deeplearning4j.nn.graph.ComputationGraph.outputOfLayersDetached(ComputationGraph.java:2416)
	... 9 more

Looking forward to your assistance.

@ajmakoni Try calling System.gc() before the eval(…). Like I said, sometimes even with the periodic gc you’ll hit a race condition where it doesn’t collect in time. You can also try setting the gc period to longer.
It really depends on what your actual constraints are.

Thank you @agibsonccc for your quick response. A quick test with System.gc() called before eval() now gets past the first epoch. With this change, training now runs for 8 epochs. This is the new crash report:

11:30:28.400 [main] INFO com.sample.ui.TrainProgram - Total epochs: 8
11:30:28.400 [main] INFO com.sample.ui.TrainProgram - Best epoch number: 2
11:30:28.400 [main] INFO com.sample.ui.TrainProgram - Score at best epoch: 0.09629346433095051
11:30:28.400 [main] INFO com.sample.ui.TrainProgram - Evaluate model at iteration 0 ....
[1914.803s][info][gc] GC(266) Pause Full (System.gc()) 1695M->40M(2048M) 17.839ms
[1915.863s][info][gc] GC(267) Pause Full (System.gc()) 43M->40M(2048M) 20.110ms
[1915.992s][info][gc] GC(268) Pause Full (System.gc()) 40M->40M(2048M) 18.692ms
[1916.111s][info][gc] GC(269) Pause Full (System.gc()) 40M->40M(2048M) 17.600ms
[1916.230s][info][gc] GC(270) Pause Full (System.gc()) 40M->40M(2048M) 18.157ms
[1916.351s][info][gc] GC(271) Pause Full (System.gc()) 40M->40M(2048M) 19.074ms
[1916.470s][info][gc] GC(272) Pause Full (System.gc()) 40M->40M(2048M) 17.940ms
[1916.589s][info][gc] GC(273) Pause Full (System.gc()) 40M->40M(2048M) 17.450ms
[1916.709s][info][gc] GC(274) Pause Full (System.gc()) 40M->40M(2048M) 18.889ms
[1916.829s][info][gc] GC(275) Pause Full (System.gc()) 40M->40M(2048M) 17.946ms
[1916.950s][info][gc] GC(276) Pause Full (System.gc()) 40M->40M(2048M) 19.492ms
Exception in thread "main" java.lang.RuntimeException: Error during neural network forward pass
	at org.deeplearning4j.nn.graph.ComputationGraph.outputOfLayersDetached(ComputationGraph.java:2526)
	at org.deeplearning4j.nn.graph.ComputationGraph.doEvaluationHelper(ComputationGraph.java:4234)
	at org.deeplearning4j.nn.graph.ComputationGraph.doEvaluationHelper(ComputationGraph.java:4189)
	at org.deeplearning4j.nn.graph.ComputationGraph.doEvaluation(ComputationGraph.java:4147)
	at org.deeplearning4j.nn.graph.ComputationGraph.doEvaluation(ComputationGraph.java:4134)
	at org.deeplearning4j.nn.graph.ComputationGraph.evaluate(ComputationGraph.java:3979)
	at org.deeplearning4j.nn.graph.ComputationGraph.evaluate(ComputationGraph.java:3947)
	at org.deeplearning4j.nn.graph.ComputationGraph.evaluate(ComputationGraph.java:3926)
	at com.sample.ui.TrainProgram.evalOn(TrainProgram.java:202)
	at com.sample.ui.TrainProgram.main(TrainProgram.java:175)
Caused by: java.lang.OutOfMemoryError: Failed to allocate memory within limits: totalBytes (7080M + 7074M) > maxBytes (11264M)
	at org.bytedeco.javacpp.Pointer.deallocator(Pointer.java:696)
	at org.deeplearning4j.cuda.BaseCudnnHelper$DataCache.<init>(BaseCudnnHelper.java:129)
	at org.deeplearning4j.cuda.convolution.CudnnConvolutionHelper.preOutput(CudnnConvolutionHelper.java:544)
	at org.deeplearning4j.nn.layers.convolution.ConvolutionLayer.preOutput(ConvolutionLayer.java:425)
	at org.deeplearning4j.nn.layers.convolution.ConvolutionLayer.activate(ConvolutionLayer.java:522)
	at org.deeplearning4j.nn.layers.FrozenLayer.activate(FrozenLayer.java:82)
	at org.deeplearning4j.nn.graph.vertex.impl.LayerVertex.doForward(LayerVertex.java:110)
	at org.deeplearning4j.nn.graph.ComputationGraph.outputOfLayersDetached(ComputationGraph.java:2416)
	... 9 more

I will run further tests with your suggestion of setting the gc period longer and will post the test results here accordingly.

Hi guys,

To be a little more clear on the steps that we did, here are the details below:

We are using these memory settings:

When we enable the automatic gc, we don’t get an error in the first epoch evaluation

And if we use this custom class, the training runs 8 epochs without problems

But it is failing here

For now we are catching this error and saving the model, but we are testing the same code on Windows with CUDA 10.2 and a GTX 1080 Ti video card and we don’t have this problem. Also you suggested:

You can also try setting the gc period to longer.

Are you able to explain more about how to achieve this?
Thank you for your continued and timely assistance

@ajmakoni Thanks for following up and asking. On the gc period, all you need to do is set the auto gc windows to be more frequent. That’s actually what I meant. Sorry for the lack of clarity there.

What is your actual batch size you’re trying to test against?

Hey, we are using batch size 32. thank you for the clarity

@agibsonccc @treo hey guys, thank you for the update to dl4j. I have updated my test project from 1.0.0-M1 to the latest release, 1.0.0-M1.1, and now my sample test project is running all epochs and evaluation without an out of memory error. However, the test is running quite slowly.

@ajmakoni that’s great to hear! Did you enable cudnn? Cudnn - Deeplearning4j

@agibsonccc thank you for the quick response. As you can see in my attached screenshot, I have cuDNN 8.1 installed and enabled

Here is my pom.xml

<project xmlns="http://maven.apache.org/POM/4.0.0"
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>
	<groupId>SampleNd4j</groupId>
	<artifactId>SampleNd4j</artifactId>
	<version>1.0.0-M1.1</version>
	<build>
		<sourceDirectory>src</sourceDirectory>
		<plugins>
			<plugin>
				<artifactId>maven-compiler-plugin</artifactId>
				<version>3.8.1</version>
				<configuration>
					<release>11</release>
				</configuration>
			</plugin>
		</plugins>
	</build>
	<properties>
		<nd4j.cpu.backend>nd4j-native-platform</nd4j.cpu.backend>
		<nd4j.cpu.backend2>nd4j-native</nd4j.cpu.backend2>
		<nd4j.gpu.backend>nd4j-cuda-11.2-platform</nd4j.gpu.backend>
		<dl4j.gpu.backend>deeplearning4j-cuda-11.2</dl4j.gpu.backend>
		<!-- <cuda.redist.10.1.version>10.1-7.6-1.5.2</cuda.redist.10.1.version> 
			<cuda.redist.10.2.version>10.2-7.6-1.5.3</cuda.redist.10.2.version> -->
		<cuda.redist.11.2.version>11.2-8.1-1.5.5</cuda.redist.11.2.version>
		<dl4j.version>1.0.0-M1.1</dl4j.version>
		<ffmpeg.version>3.2.1-1.3</ffmpeg.version>
		<javacv.version>1.4.1</javacv.version>
		<logback.version>1.1.7</logback.version>
		<jackson.version>2.9.6</jackson.version>
	</properties>

	<dependencies>
		<dependency>
			<groupId>org.bytedeco</groupId>
			<artifactId>opencv-platform</artifactId>
			<version>4.5.1-1.5.5</version>
		</dependency>

		<!-- Jackson dependencies -->
		<dependency>
			<groupId>com.fasterxml.jackson.core</groupId>
			<artifactId>jackson-core</artifactId>
			<version>${jackson.version}</version>
		</dependency>
		<dependency>
			<groupId>com.fasterxml.jackson.core</groupId>
			<artifactId>jackson-databind</artifactId>
			<version>${jackson.version}</version>
		</dependency>

		<!-- Log dependency -->
		<dependency>
			<groupId>org.slf4j</groupId>
			<artifactId>slf4j-api</artifactId>
			<version>1.7.25</version>
		</dependency>
		<dependency>
			<groupId>ch.qos.logback</groupId>
			<artifactId>logback-classic</artifactId>
			<version>${logback.version}</version>
		</dependency>


		<!-- deeplearning4j-core: contains main functionality and neural networks -->
		<dependency>
			<groupId>org.deeplearning4j</groupId>
			<artifactId>deeplearning4j-core</artifactId>
			<version>${dl4j.version}</version>
		</dependency>
		<dependency>
			<groupId>org.nd4j</groupId>
			<artifactId>nd4j-native</artifactId>
			<version>${dl4j.version}</version>
		</dependency>
		<dependency>
			<groupId>org.deeplearning4j</groupId>
			<artifactId>deeplearning4j-cuda-11.2</artifactId>
			<version>${dl4j.version}</version>
		</dependency>

		<dependency>
			<groupId>org.nd4j</groupId>
			<artifactId>nd4j-cuda-11.2-platform</artifactId>
			<version>${dl4j.version}</version>
		</dependency>
		<!-- new directive -->
		<dependency>
			<groupId>org.nd4j</groupId>
			<artifactId>nd4j-cuda-11.2</artifactId>
			<version>${dl4j.version}</version>
			<classifier>linux-x86_64-cudnn</classifier>
		</dependency>

		<!-- ParallelWrapper & ParallelInference live here -->
		<dependency>
			<groupId>org.deeplearning4j</groupId>
			<artifactId>deeplearning4j-parallel-wrapper</artifactId>
			<version>${dl4j.version}</version>
		</dependency>
		<dependency>
			<groupId>org.datavec</groupId>
			<artifactId>datavec-data-image</artifactId>
			<version>${dl4j.version}</version>
		</dependency>
		<dependency>
			<groupId>org.datavec</groupId>
			<artifactId>datavec-local</artifactId>
			<version>${dl4j.version}</version>
		</dependency>
		<dependency>
			<groupId>org.deeplearning4j</groupId>
			<artifactId>deeplearning4j-zoo</artifactId>
			<version>${dl4j.version}</version>
		</dependency>
	</dependencies>
</project>

@ajmakoni you can just remove deeplearning4j-cudnn from that. Can you show an output log of cudnn vs non cudnn? Use the PerformanceListener during the training.

Something to reproduce will help us understand if you have a real bottleneck here.

Anything that uses most of your GPU capacity will trigger frequent GCs which maybe part of the issue.
Could you also show your training loop here?

@agibsonccc hi, sorry for the long silence. Would you like me to send the test project to you? I did the test as suggested but it keeps running into out of memory errors

@ajmakoni yes you can put it on github as a private or public repo and share it with my username on github (same as here) or you can coordinate with me over DM depending on how sensitive it is. Thanks!