A question of semantics: should this data be treated as 1 feature observed over 60 time steps, or as 60 separate features?
An error still occurs because the features .csv files are row vectors, whereas CSVSequenceRecordReader expects column vectors (one value per line).
The following is my current code (excluding imports):
@SuppressWarnings("ResultOfMethodCallIgnored")
public class UCI600ClassificationExample {

    //FIX: original logged under UCISequenceClassificationExample.class; use this class instead.
    private static final Logger log = LoggerFactory.getLogger(UCI600ClassificationExample.class);

    //'baseDir': Base directory for the data. Change this if you want to save the data somewhere else
    private static File baseDir = new File("src/main/resources/forex/");
    private static File baseTrainDir = new File(baseDir, "train");
    private static File featuresDirTrain = new File(baseTrainDir, "features");
    private static File labelsDirTrain = new File(baseTrainDir, "labels");
    private static File baseTestDir = new File(baseDir, "test");
    private static File featuresDirTest = new File(baseTestDir, "features");
    private static File labelsDirTest = new File(baseTestDir, "labels");

    public static void main(String[] args) throws Exception {
        downloadUCIData();

        // ----- Load the training data -----
        //450 training files for features: train/features/0.csv through train/features/449.csv,
        //with matching label files under train/labels/.
        SequenceRecordReader trainFeatures = new CSVSequenceRecordReader();
        trainFeatures.initialize(new NumberedFileInputSplit(featuresDirTrain.getAbsolutePath() + "/%d.csv", 0, 449));
        SequenceRecordReader trainLabels = new CSVSequenceRecordReader();
        trainLabels.initialize(new NumberedFileInputSplit(labelsDirTrain.getAbsolutePath() + "/%d.csv", 0, 449));

        int miniBatchSize = 10;
        int numLabelClasses = 6;
        DataSetIterator trainData = new SequenceRecordReaderDataSetIterator(trainFeatures, trainLabels,
            miniBatchSize, numLabelClasses, false,
            SequenceRecordReaderDataSetIterator.AlignmentMode.ALIGN_END);

        //Normalize the training data: collect statistics on one pass, then normalize on-the-fly.
        DataNormalization normalizer = new NormalizerStandardize();
        normalizer.fit(trainData);      //Collect training data statistics
        trainData.reset();
        //Each DataSet returned by the 'trainData' iterator will be normalized
        trainData.setPreProcessor(normalizer);

        // ----- Load the test data -----
        //Same process as for the training data (150 files: 0.csv through 149.csv).
        SequenceRecordReader testFeatures = new CSVSequenceRecordReader();
        testFeatures.initialize(new NumberedFileInputSplit(featuresDirTest.getAbsolutePath() + "/%d.csv", 0, 149));
        SequenceRecordReader testLabels = new CSVSequenceRecordReader();
        testLabels.initialize(new NumberedFileInputSplit(labelsDirTest.getAbsolutePath() + "/%d.csv", 0, 149));
        DataSetIterator testData = new SequenceRecordReaderDataSetIterator(testFeatures, testLabels,
            miniBatchSize, numLabelClasses, false,
            SequenceRecordReaderDataSetIterator.AlignmentMode.ALIGN_END);
        //Use the exact same normalization statistics as the training data
        testData.setPreProcessor(normalizer);

        // ----- Configure the network -----
        //nIn(1): each feature CSV is a column vector, i.e. ONE feature observed over the
        //sequence's time steps. If you intend 60 features per time step instead, the CSVs
        //must have 60 comma-separated values per line and nIn must become 60.
        MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
            .seed(123)  //Random number generator seed for improved repeatability. Optional.
            .weightInit(WeightInit.XAVIER)
            .updater(new Nadam())
            .gradientNormalization(GradientNormalization.ClipElementWiseAbsoluteValue) //Not always required, but helps with this data set
            .gradientNormalizationThreshold(0.5)
            .list()
            .layer(new LSTM.Builder().activation(Activation.TANH).nIn(1).nOut(10).build())
            .layer(new LSTM.Builder().activation(Activation.TANH).nIn(10).nOut(10).build())
            .layer(new LSTM.Builder().activation(Activation.TANH).nIn(10).nOut(10).build())
            .layer(new RnnOutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                .activation(Activation.SOFTMAX).nIn(10).nOut(numLabelClasses).build())
            .build();

        MultiLayerNetwork net = new MultiLayerNetwork(conf);
        net.init();

        log.info("Starting training...");
        //Print the score (loss function value) every 20 iterations; evaluate on test data each epoch
        net.setListeners(new ScoreIterationListener(20),
            new EvaluativeListener(testData, 1, InvocationType.EPOCH_END));

        int nEpochs = 100;
        net.fit(trainData, nEpochs);

        log.info("Evaluating...");
        Evaluation eval = net.evaluate(testData);
        log.info(eval.stats());
        log.info("----- Example Complete -----");
    }

    //Loads the data and converts the "one time series per line" format into the one-value-per-line
    //(column vector) CSV sequence format that DataVec's CSVSequenceRecordReader expects.
    //File i holds series i in features/, and its single class label in labels/.
    private static void downloadUCIData() throws Exception {
        if (baseDir.exists()) return; //Data already exists, don't prepare it again

        // synthetic data: one whitespace-separated time series per line
        // NOTE(review): assumes values are space-separated — if data2.data is comma-separated,
        // the transpose below must split on "," instead; confirm against the file.
        String url1 = "file:///C:/Synthetic Control/data2.data";
        String data = IOUtils.toString(new URL(url1), (Charset) null);
        String[] seriesLines = data.split("\n");

        // synthetic categories: one class label per line, aligned by index with the series file
        String url2 = "file:///C:/Synthetic Control/cats2.csv";
        String cats = IOUtils.toString(new URL(url2), (Charset) null);
        String[] labelLines = cats.split("\n");

        //Create directories (mkdirs: also creates missing parents, unlike mkdir)
        baseDir.mkdirs();
        baseTrainDir.mkdirs();
        featuresDirTrain.mkdirs();
        labelsDirTrain.mkdirs();
        baseTestDir.mkdirs();
        featuresDirTest.mkdirs();
        labelsDirTest.mkdirs();

        //BUG FIX: the original loop iterated over the *label* lines and transposed those,
        //so the feature CSVs were never built from the data file and stayed row vectors.
        //Transpose the data line (one value per row) and pair it with the label at the same index.
        //trim() removes leading spaces and trailing '\r' that would otherwise create empty rows.
        int nSeries = Math.min(seriesLines.length, labelLines.length);
        List<Pair<String, String>> dataAndCats = new ArrayList<>();
        for (int i = 0; i < nSeries; i++) {
            String transposed = seriesLines[i].trim().replaceAll("\\s+", "\n");
            dataAndCats.add(new Pair<>(transposed, labelLines[i].trim()));
        }

        //Do a train/test split:
        int nTrain = 450; //70% train, 30% test (450 of 600)
        int trainCount = 0;
        int testCount = 0;
        for (Pair<String, String> p : dataAndCats) {
            //Write output in a format we can read, in the appropriate locations
            File outPathFeatures;
            File outPathLabels;
            if (trainCount < nTrain) {
                outPathFeatures = new File(featuresDirTrain, trainCount + ".csv");
                outPathLabels = new File(labelsDirTrain, trainCount + ".csv");
                trainCount++;
            } else {
                outPathFeatures = new File(featuresDirTest, testCount + ".csv");
                outPathLabels = new File(labelsDirTest, testCount + ".csv");
                testCount++;
            }
            //p.getFirst() is already newline-separated; the original's split(" ")/join was a no-op.
            FileUtils.writeStringToFile(outPathFeatures, p.getFirst(), (Charset) null);
            //NOTE(review): labels must be integer class indices 0..5 for numLabelClasses=6 — confirm
            //cats2.csv uses zero-based indices.
            FileUtils.writeStringToFile(outPathLabels, p.getSecond(), (Charset) null);
        }
    }
}