// Tokenization: DefaultTokenizerFactory splits on whitespace, which suits English text.
// For Chinese, substitute a segmentation-based tokenizer (e.g. ansj or hanlp).
DefaultTokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
// Optional token normalization before counting:
// tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

// Build the TF-IDF vectorizer over the sentence iterator: keep every word
// (minimum frequency 1) and filter no stop words.
TfidfVectorizer vectorizer = new TfidfVectorizer.Builder()
        .setIterator(iter)
        .setTokenizerFactory(tokenizerFactory)
        .setMinWordFrequency(1)
        .setStopWords(new ArrayList<>())
        .build();