• Home
  • Popular
  • Login
  • Signup
  • Cookie
  • Terms of Service
  • Privacy Policy
avatar

Posted by User Bot


30 Nov, 2024

Updated at 12 Dec, 2024

TF-IDF Implementation in Java

I have tried to follow the formulas for the TF-IDF and cosine-similarity calculations and translated them into code. The results I get seem to be what they should be, but I am worried that I have missed something, or that I have not done the TF-IDF or cosine calculations correctly. If someone could please review the code and give some pointers or critique, it would be a huge help.

This is my class for calculations

package org.example.tfidf;

import java.util.*;

/**
 * Builds TF-IDF vectors for a corpus of objects and ranks the corpus against a query
 * by cosine similarity.
 *
 * <p>Each object is turned into a document with {@code toString()}. Documents are split
 * on whitespace only; tokenization is case-sensitive, so {@code "The"} and {@code "the"}
 * are distinct terms.
 *
 * <p>All corpus state is held per instance and is rebuilt from scratch on every call to
 * {@link #createTFIDFVectors(List)}.
 *
 * @param <T> type of the objects that make up the corpus
 */
public class TFIDF<T> {
    /** String form of every corpus object, in input order. */
    private List<String> documents = new ArrayList<>();
    /** Distinct corpus terms; position {@code i} is dimension {@code i} of every vector. */
    private List<String> vocabularyList = new ArrayList<>();
    /** Number of corpus documents that contain each term at least once. */
    private Map<String, Integer> documentFrequencies = new HashMap<>();
    /** TF-IDF vector of each corpus document, keyed by document index. */
    private Map<Integer, double[]> tfIdfVectors = new HashMap<>();

    /** Converts every corpus object to its string form. */
    private List<String> splitIntoDocuments(List<T> objects) {
        List<String> result = new ArrayList<>(objects.size());
        for (T object : objects) {
            Objects.requireNonNull(object, "objects must not contain null elements");
            result.add(object.toString());
        }
        return result;
    }

    /** Splits a document on runs of whitespace, dropping empty tokens. */
    private List<String> terms(String document) {
        List<String> tokens = new ArrayList<>();
        for (String token : document.split("\\s+")) {
            // split() yields a leading "" when the text starts with whitespace or is empty.
            if (!token.isEmpty()) {
                tokens.add(token);
            }
        }
        return tokens;
    }

    /**
     * Inverse document frequency {@code ln(N / df)} of a term over the current corpus,
     * or 0 when no corpus document contains the term.
     */
    private double idf(String term) {
        int df = documentFrequencies.getOrDefault(term, 0);
        if (df == 0) {
            return 0.0;
        }
        return Math.log((double) documents.size() / df);
    }

    /**
     * Projects a tokenized document onto the current vocabulary.
     * Component {@code i} is {@code tf * idf} of vocabulary term {@code i}, where
     * {@code tf} is the term's count divided by the number of tokens.
     */
    private double[] vectorize(List<String> tokens) {
        double[] vector = new double[vocabularyList.size()];
        if (tokens.isEmpty()) {
            return vector; // avoids 0/0 for a document without terms
        }
        Map<String, Integer> counts = new HashMap<>();
        for (String token : tokens) {
            counts.merge(token, 1, Integer::sum);
        }
        for (int termIndex = 0; termIndex < vector.length; termIndex++) {
            String term = vocabularyList.get(termIndex);
            Integer count = counts.get(term);
            if (count != null) {
                vector[termIndex] = ((double) count / tokens.size()) * idf(term);
            }
        }
        return vector;
    }

    /**
     * Replaces the current corpus with {@code objects} and computes one TF-IDF vector per
     * document.
     *
     * @param objects corpus objects; each is converted with {@code toString()}
     * @return a new map from document index (position in {@code objects}) to its vector;
     *         the map is a copy, the arrays are the ones used for later ranking
     * @throws NullPointerException if {@code objects} or one of its elements is null
     */
    public Map<Integer, double[]> createTFIDFVectors(List<T> objects) {
        Objects.requireNonNull(objects, "objects");
        List<String> newDocuments = splitIntoDocuments(objects);

        // Tokenize each document once and collect document frequencies in the same pass,
        // instead of re-tokenizing the whole corpus for every (document, term) pair.
        List<List<String>> tokenized = new ArrayList<>(newDocuments.size());
        Set<String> vocabulary = new HashSet<>();
        Map<String, Integer> frequencies = new HashMap<>();
        for (String document : newDocuments) {
            List<String> tokens = terms(document);
            tokenized.add(tokens);
            vocabulary.addAll(tokens);
            for (String term : new HashSet<>(tokens)) {
                frequencies.merge(term, 1, Integer::sum);
            }
        }

        documents = newDocuments;
        vocabularyList = new ArrayList<>(vocabulary);
        documentFrequencies = frequencies;

        // Start from an empty map so vectors of a previous, larger corpus cannot linger.
        Map<Integer, double[]> vectors = new HashMap<>();
        for (int docIndex = 0; docIndex < tokenized.size(); docIndex++) {
            vectors.put(docIndex, vectorize(tokenized.get(docIndex)));
        }
        tfIdfVectors = vectors;
        return new HashMap<>(vectors);
    }

    /**
     * Cosine of the angle between two vectors of equal length.
     *
     * @return the cosine similarity, or 0 when either vector has zero magnitude
     *         (the quotient would otherwise be NaN)
     * @throws NullPointerException if either vector is null
     * @throws IllegalArgumentException if the vectors differ in length
     */
    public double cosineSimilarity(double[] vectorA, double[] vectorB) {
        Objects.requireNonNull(vectorA, "vectorA");
        Objects.requireNonNull(vectorB, "vectorB");
        if (vectorA.length != vectorB.length) {
            throw new IllegalArgumentException(
                    "Vector lengths differ: " + vectorA.length + " vs " + vectorB.length);
        }

        double dotProduct = 0.0;
        double normA = 0.0;
        double normB = 0.0;
        for (int i = 0; i < vectorA.length; i++) {
            dotProduct += vectorA[i] * vectorB[i];
            normA += vectorA[i] * vectorA[i];
            normB += vectorB[i] * vectorB[i];
        }

        if (normA == 0.0 || normB == 0.0) {
            return 0.0;
        }
        return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
    }

    /**
     * Ranks the corpus documents by cosine similarity to a query text, most similar first.
     * The query is weighted with the IDF values of the current corpus; query terms that
     * are not in the vocabulary are ignored.
     *
     * @param object query text, tokenized the same way as corpus documents
     * @return document indices in descending order of similarity; empty if no corpus
     *         has been built yet
     * @throws NullPointerException if {@code object} is null
     */
    public List<Integer> listMostSimilarDescend(String object) {
        Objects.requireNonNull(object, "object");
        double[] queryVector = vectorize(terms(object));

        Map<Integer, Double> similarities = new HashMap<>();
        for (Map.Entry<Integer, double[]> entry : tfIdfVectors.entrySet()) {
            similarities.put(entry.getKey(), cosineSimilarity(queryVector, entry.getValue()));
        }

        List<Integer> ranked = new ArrayList<>(similarities.keySet());
        // List.sort is stable, so equally similar documents keep the map's iteration order.
        ranked.sort((left, right) ->
                Double.compare(similarities.get(right), similarities.get(left)));
        return ranked;
    }
}

This is an example of how the class is used, so you can see its purpose:

  /**
   * Demo entry point: starts the Spring application, builds TF-IDF vectors for three
   * sample sentences, prints each vector, prints two pairwise cosine similarities and
   * finally the document ranking for the query {@code "cat sat the mat"}.
   *
   * @param args command-line arguments, passed through to {@code SpringApplication.run}
   */
  public static void main(String[] args) {
      SpringApplication.run(TfidfApplication.class, args);
      String doc1 = "The cat sat on the mat";
      String doc2 = "The dog sat on the mat";
      String doc3 = "dog and dogs are animals";

      // Explicit type arguments: with raw types, entry.getValue() and tfIdfVectors.get(i)
      // are typed as Object and cannot be used as double[].
      List<String> documents = Arrays.asList(doc1, doc2, doc3);
      TFIDF<String> tfidf = new TFIDF<>();

      Map<Integer, double[]> tfIdfVectors = tfidf.createTFIDFVectors(documents);

      System.out.println("TF-IDF Vectors:");

      for (Map.Entry<Integer, double[]> entry : tfIdfVectors.entrySet()) {
          System.out.println("Document "+entry.getKey());
          double[] tfidfvector = entry.getValue();
          for (double v : tfidfvector) {
              System.out.print(v + " ");
          }
          System.out.println();
      }

      System.out.println("Cosine between doc0 and doc1: "+tfidf.cosineSimilarity(tfIdfVectors.get(0), tfIdfVectors.get(1)));
      System.out.println("Cosine between doc1 and doc2: "+tfidf.cosineSimilarity(tfIdfVectors.get(1), tfIdfVectors.get(2)));
      System.out.println("Most similar documents: "+tfidf.listMostSimilarDescend("cat sat the mat"));
  }

Results of main function

TF-IDF Vectors:

Document 0 0.06757751801802739 0.06757751801802739 0.06757751801802739 0.0 0.0 0.1831020481113516 0.06757751801802739 0.0 0.0 0.0 0.06757751801802739

Document 1 0.06757751801802739 0.06757751801802739 0.06757751801802739 0.0 0.0 0.0 0.06757751801802739 0.0 0.0 0.06757751801802739 0.06757751801802739

Document 2 0.0 0.0 0.0 0.21972245773362198 0.21972245773362198 0.0 0.0 0.21972245773362198 0.21972245773362198 0.08109302162163289 0.0

Cosine between doc0 and doc1: 0.5810469954347838

Most similar documents first most similar and so on: [0, 1, 2]