Passage Jina -> intfloat/multilingual-e5-large

Jina me donne toujours RuntimeError: The size of tensor a (5) must match the size of tensor b (4) at non-singleton dimension 1
This commit is contained in:
2026-02-06 17:38:27 +01:00
parent 14b8664106
commit 1c2f0728ea
2 changed files with 13 additions and 35 deletions

View File

@@ -3,43 +3,21 @@ from langchain_chroma import Chroma # TODO plus tard, ramplacer par PG Vector
Before (singleton class that builds and holds the ChromaDB handle):

    import sys
    from pathlib import Path

    # Permet de garder ChromaDB en mémoire.
    # Cette classe est un Singleton, il n'y en aura qu'une seule et unique instance à tout moment
    # https://refactoring.guru/design-patterns/singleton
    class VectorDatabase:
        instance = None
        def __new__(cls): # Selon https://www.geeksforgeeks.org/python/singleton-pattern-in-python-a-complete-guide/
            if cls.instance is None:
                cls.instance = super().__new__(cls)
                # J'initialise les attributs à None ici, permet de tester si la classe a déjà été init une première fois ou non
                cls.instance.__embeddings = None
                cls.instance.__chroma = None
            return cls.instance
        def __init__(self):
            if self.__embeddings is not None: return
            base_dir:str = Path(sys.argv[0]).resolve().parent.as_posix() # Récupérer le chemin vers le point d'entrée du programme
            bdd_path:str = base_dir + "/chroma_db/"
            self.__embeddings = HuggingFaceEmbeddings(model_name="jinaai/jina-embeddings-v3", model_kwargs={"trust_remote_code": True})
            self.__chroma = Chroma(
                persist_directory=bdd_path,
                embedding_function=self.__embeddings
            )
        def getChroma(self)->Chroma:
            return self.__chroma
        def getEmbeddings(self)->'Embeddings Hugging Face':
            return self.__embeddings

    if __name__ == "__main__":
        test1 = VectorDatabase()
        print('TEST 1 INIT')
        test2 = VectorDatabase()
        print(test1 is test2)
        assert test1 is test2

After (model switched from jinaai/jina-embeddings-v3 to intfloat/multilingual-e5-large; embeddings and Chroma handle promoted to module-level constants, the class keeps only static accessors, and the self-test block is removed):

    import sys
    from pathlib import Path

    base_dir:str = Path(sys.argv[0]).resolve().parent.as_posix() # Récupérer le chemin vers le point d'entrée du programme
    bdd_path:str = base_dir + "/../chroma_db/"
    EMBEDDINGS = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large", model_kwargs={"trust_remote_code": True})
    CHROMA = Chroma(
        persist_directory=bdd_path,
        embedding_function=EMBEDDINGS
    )
    class VectorDatabase: # Classe pour récupérer la BDD
        @staticmethod
        def getChroma()->Chroma:
            return CHROMA
        @staticmethod
        def getEmbeddings()->'Embeddings Hugging Face':
            return EMBEDDINGS

View File

@@ -43,7 +43,7 @@ print("===")
Before:

    # Création du modèle d'embeddings
    # https://docs.langchain.com/oss/python/integrations/text_embedding/huggingfacehub
    # https://huggingface.co/jinaai/jina-clip-v2
    embeddings = HuggingFaceEmbeddings(model_name="jinaai/jina-embeddings-v3", model_kwargs={"trust_remote_code": True})
    # Stockage des embeddings dans ChromaDB dans un dossier local "chroma_db"
    vectorstore = Chroma.from_documents(documents=chunks,embedding=embeddings, persist_directory=base_dir.as_posix()+"/chroma_db/",) # https://docs.langchain.com/oss/python/integrations/vectorstores/chroma

After (only the embedding model name changed):

    # Création du modèle d'embeddings
    # https://docs.langchain.com/oss/python/integrations/text_embedding/huggingfacehub
    # https://huggingface.co/jinaai/jina-clip-v2
    embeddings = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large", model_kwargs={"trust_remote_code": True})
    # Stockage des embeddings dans ChromaDB dans un dossier local "chroma_db"
    vectorstore = Chroma.from_documents(documents=chunks,embedding=embeddings, persist_directory=base_dir.as_posix()+"/chroma_db/",) # https://docs.langchain.com/oss/python/integrations/vectorstores/chroma