I have tried to follow the formulas for TF-IDF and cosine-similarity calculation and translated them into code. The results I get seem to be what they should be, but I am worried that I have missed something, or that I have not done the TF-IDF or cosine calculations correctly. If someone could please review it and give some pointers or critique, it would be a huge help.
This is my class for the calculations:
package org.example.tfidf;
import java.util.*;
/**
 * Builds TF-IDF vectors for a corpus of documents and ranks the corpus against a query
 * by cosine similarity.
 *
 * <p>Each object passed to {@link #createTFIDFVectors(List)} is turned into a document via
 * {@code toString()} and split on whitespace. Term frequency is
 * {@code count(term in document) / totalTerms(document)} and inverse document frequency is
 * {@code ln(N / df)}, where {@code N} is the number of documents and {@code df} is the number
 * of documents containing the term. No smoothing is applied, so a term that occurs in every
 * document has weight 0.
 *
 * <p>All state is per instance; separate instances never influence each other. The class is
 * not synchronized.
 *
 * <p>NOTE(review): tokens are compared case-sensitively and punctuation is not stripped, so
 * "The" and "the" are distinct terms. Normalise the input first if that is not intended.
 *
 * @param <T> type of the objects whose {@code toString()} value is used as document text
 */
public class TFIDF<T> {

    /** Distinct terms of the corpus in first-seen order; index i is dimension i of every vector. */
    private List<String> vocabularyList = new ArrayList<>();

    /** Inverse document frequency of every vocabulary term, computed once per corpus. */
    private Map<String, Double> idfByTerm = new HashMap<>();

    /** TF-IDF vector of each corpus document, keyed by the document's index in the input list. */
    private Map<Integer, double[]> tfIdfVectors = new LinkedHashMap<>();

    /**
     * Replaces the current corpus with {@code objects} and computes one TF-IDF vector per
     * document. Any vectors from an earlier call are discarded.
     *
     * @param objects the corpus; element i becomes document i
     * @return a new map from document index to its TF-IDF vector (the arrays are the ones
     *         held internally, the map itself is a copy)
     * @throws NullPointerException if {@code objects} or one of its elements is null
     */
    public Map<Integer, double[]> createTFIDFVectors(List<? extends T> objects) {
        Objects.requireNonNull(objects, "objects");

        // Tokenise every document exactly once and count, per term, how many documents contain it.
        List<List<String>> tokenisedDocuments = new ArrayList<>(objects.size());
        Map<String, Integer> documentFrequency = new LinkedHashMap<>();
        for (T object : objects) {
            Objects.requireNonNull(object, "objects must not contain null");
            List<String> documentTerms = terms(object.toString());
            tokenisedDocuments.add(documentTerms);
            // A set so that a term repeated inside one document is counted once for df.
            for (String term : new LinkedHashSet<>(documentTerms)) {
                documentFrequency.merge(term, 1, Integer::sum);
            }
        }

        int documentCount = tokenisedDocuments.size();
        Map<String, Double> idf = new HashMap<>();
        for (Map.Entry<String, Integer> entry : documentFrequency.entrySet()) {
            idf.put(entry.getKey(), Math.log((double) documentCount / entry.getValue()));
        }

        // Publish the new vocabulary before vectorising, because vectorize reads these fields.
        vocabularyList = new ArrayList<>(documentFrequency.keySet());
        idfByTerm = idf;

        Map<Integer, double[]> vectors = new LinkedHashMap<>();
        for (int docIndex = 0; docIndex < documentCount; docIndex++) {
            vectors.put(docIndex, vectorize(tokenisedDocuments.get(docIndex)));
        }
        tfIdfVectors = vectors;
        return new LinkedHashMap<>(vectors);
    }

    /**
     * Computes the cosine of the angle between two vectors.
     *
     * @param vectorA first vector
     * @param vectorB second vector, same length as {@code vectorA}
     * @return {@code dot(A, B) / (|A| * |B|)}, or {@code 0.0} when either vector has zero
     *         length (instead of the NaN a plain division would produce)
     * @throws IllegalArgumentException if the vectors differ in length
     */
    public double cosineSimilarity(double[] vectorA, double[] vectorB) {
        if (vectorA.length != vectorB.length) {
            throw new IllegalArgumentException(
                    "Vector lengths differ: " + vectorA.length + " vs " + vectorB.length);
        }
        double dotProduct = 0.0;
        double squaredNormA = 0.0;
        double squaredNormB = 0.0;
        for (int i = 0; i < vectorA.length; i++) {
            dotProduct += vectorA[i] * vectorB[i];
            squaredNormA += vectorA[i] * vectorA[i];
            squaredNormB += vectorB[i] * vectorB[i];
        }
        double denominator = Math.sqrt(squaredNormA) * Math.sqrt(squaredNormB);
        // A zero vector has no direction; report "not similar" rather than NaN, which would
        // otherwise sort ahead of every real score in listMostSimilarDescend.
        if (denominator == 0.0) {
            return 0.0;
        }
        return dotProduct / denominator;
    }

    /**
     * Ranks the corpus documents by cosine similarity to the given query text.
     *
     * <p>The query is weighted with the IDF values of the current corpus; query terms that do
     * not occur in the corpus are ignored. Documents with equal similarity are ordered by
     * ascending index.
     *
     * @param object the query text
     * @return document indices, most similar first; empty if no corpus has been built
     * @throws NullPointerException if {@code object} is null
     */
    public List<Integer> listMostSimilarDescend(String object) {
        Objects.requireNonNull(object, "object");
        double[] queryVector = vectorize(terms(object));

        Map<Integer, Double> similarityByDocument = new HashMap<>();
        for (Map.Entry<Integer, double[]> entry : tfIdfVectors.entrySet()) {
            similarityByDocument.put(entry.getKey(), cosineSimilarity(queryVector, entry.getValue()));
        }

        Comparator<Map.Entry<Integer, Double>> bySimilarityDescending =
                Map.Entry.<Integer, Double>comparingByValue(Comparator.reverseOrder());
        Comparator<Map.Entry<Integer, Double>> ranking =
                bySimilarityDescending.thenComparing(Map.Entry.<Integer, Double>comparingByKey());

        return similarityByDocument.entrySet().stream()
                .sorted(ranking)
                .map(Map.Entry::getKey)
                .toList();
    }

    /** Splits a document on runs of whitespace, dropping the empty token a leading blank produces. */
    private List<String> terms(String document) {
        List<String> result = new ArrayList<>();
        for (String token : document.split("\\s+")) {
            if (!token.isEmpty()) {
                result.add(token);
            }
        }
        return result;
    }

    /** Projects a tokenised text onto the current vocabulary as a TF-IDF vector. */
    private double[] vectorize(List<String> documentTerms) {
        double[] vector = new double[vocabularyList.size()];
        if (documentTerms.isEmpty()) {
            // No terms means every term frequency is 0; also avoids dividing by zero below.
            return vector;
        }

        Map<String, Integer> termCounts = new HashMap<>();
        for (String term : documentTerms) {
            termCounts.merge(term, 1, Integer::sum);
        }

        double totalTerms = documentTerms.size();
        for (int termIndex = 0; termIndex < vector.length; termIndex++) {
            String term = vocabularyList.get(termIndex);
            Integer count = termCounts.get(term);
            if (count != null) {
                vector[termIndex] = (count / totalTerms) * idfByTerm.get(term);
            }
        }
        return vector;
    }
}
This is an example usage, so you can see the purpose:
/**
 * Demo entry point: starts the Spring application, builds TF-IDF vectors for three sample
 * sentences, prints each vector, prints two pairwise cosine similarities and finally the
 * corpus ranking for a sample query.
 *
 * @param args command-line arguments, forwarded to {@code SpringApplication.run}
 */
public static void main(String[] args) {
    SpringApplication.run(TfidfApplication.class, args);

    String doc1 = "The cat sat on the mat";
    String doc2 = "The dog sat on the mat";
    String doc3 = "dog and dogs are animals";
    List<String> documents = Arrays.asList(doc1, doc2, doc3);

    // Explicit type arguments: with raw types entry.getValue() is Object, not double[].
    TFIDF<String> tfidf = new TFIDF<>();
    Map<Integer, double[]> tfIdfVectors = tfidf.createTFIDFVectors(documents);

    System.out.println("TF-IDF Vectors:");
    for (Map.Entry<Integer, double[]> entry : tfIdfVectors.entrySet()) {
        System.out.println("Document "+entry.getKey());
        double[] tfidfvector = entry.getValue();
        for (double v : tfidfvector) {
            System.out.print(v + " ");
        }
        System.out.println();
    }

    System.out.println("Cosine between doc0 and doc1: "+tfidf.cosineSimilarity(tfIdfVectors.get(0), tfIdfVectors.get(1)));
    System.out.println("Cosine between doc1 and doc2: "+tfidf.cosineSimilarity(tfIdfVectors.get(1), tfIdfVectors.get(2)));
    System.out.println("Most similar documents: "+tfidf.listMostSimilarDescend("cat sat the mat"));
}
Output of the main method:
TF-IDF Vectors:
Document 0 0.06757751801802739 0.06757751801802739 0.06757751801802739 0.0 0.0 0.1831020481113516 0.06757751801802739 0.0 0.0 0.0 0.06757751801802739
Document 1 0.06757751801802739 0.06757751801802739 0.06757751801802739 0.0 0.0 0.0 0.06757751801802739 0.0 0.0 0.06757751801802739 0.06757751801802739
Document 2 0.0 0.0 0.0 0.21972245773362198 0.21972245773362198 0.0 0.0 0.21972245773362198 0.21972245773362198 0.08109302162163289 0.0
Cosine between doc0 and doc1: 0.5810469954347838
Most similar documents first most similar and so on: [0, 1, 2]