import json

import httpx
from llama_index.core.base.llms.types import (
    ChatMessage,
    ChatResponse,
    ChatResponseGen,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.core.llms.llm import LLM
from pydantic import Field


class OllamaLLM(LLM):
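    """LlamaIndex ``LLM`` adapter for a local Ollama server.

    Talks to Ollama's ``/api/generate`` endpoint over HTTP for blocking and
    streaming text completion; chat and async variants are left unimplemented.
    """
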
    model: str = Field(default="llama3")
    base_url: str = Field(default="http://localhost:11434")

    @property
    def metadata(self) -> LLMMetadata:
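        """Static model metadata.

        The context window and output budget are assumed defaults here,
        not values probed from the Ollama server.
        """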
        return LLMMetadata(
            model_name=self.model,
            is_chat_model=False,
            context_window=4096,
            num_output=512,
            is_function_calling_model=False,
        )

    def complete(self, prompt: str, **kwargs) -> CompletionResponse:
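        """Blocking, non-streaming completion via Ollama's /api/generate."""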
        with httpx.Client(timeout=120.0) as client:
            response = client.post(
                f"{self.base_url}/api/generate",
                json={
                    "model": self.model,
                    "prompt": prompt,
                    "stream": False,
                },
            )
            response.raise_for_status()
            data = response.json()
            return CompletionResponse(text=data["response"])

    def stream(self, prompt: str, **kwargs) -> CompletionResponseGen:
        return self.stream_complete(prompt, **kwargs)

    def stream_complete(
        self,
        prompt: str,
        **kwargs,
    ) -> CompletionResponseGen:
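        """Streaming completion.

        Ollama streams newline-delimited JSON objects; each one carries an
        incremental ``response`` text delta, which is re-emitted here as a
        ``CompletionResponse`` chunk.
        """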
        headers = {"Content-Type": "application/json"}
        data = {
            "model": self.model,
            "prompt": prompt,
            "stream": True,
        }

        with httpx.stream(
            "POST",
            f"{self.base_url}/api/generate",
            headers=headers,
            json=data,
            timeout=60.0,
        ) as response:
            response.raise_for_status()
            for line in response.iter_lines():
                if not line.strip():
                    continue
                chunk = json.loads(line)
                if "response" in chunk:
                    yield CompletionResponse(text=chunk["response"])

    def chat(
        self,
        messages: list[ChatMessage],
        **kwargs,
    ) -> ChatResponse:  # pragma: no cover
        raise NotImplementedError("chat not supported")

    def stream_chat(
        self,
        messages: list[ChatMessage],
        **kwargs,
    ) -> ChatResponseGen:  # pragma: no cover
        raise NotImplementedError("stream_chat not supported")

    async def achat(
        self,
        messages: list[ChatMessage],
        **kwargs,
    ) -> ChatResponse:  # pragma: no cover
        raise NotImplementedError("async chat not supported")

    async def astream_chat(
        self,
        messages: list[ChatMessage],
        **kwargs,
    ) -> ChatResponseGen:  # pragma: no cover
        raise NotImplementedError("async stream_chat not supported")

    async def acomplete(
        self,
        prompt: str,
        **kwargs,
    ) -> CompletionResponse:  # pragma: no cover
        raise NotImplementedError("async complete not supported")

    async def astream_complete(
        self,
        prompt: str,
        **kwargs,
    ) -> CompletionResponseGen:  # pragma: no cover
        raise NotImplementedError("async stream_complete not supported")
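

# A minimal usage sketch, assuming an Ollama server is reachable at the
# default localhost:11434 and the "llama3" model has been pulled; adjust
# `model` and `base_url` to match your setup.
if __name__ == "__main__":
    llm = OllamaLLM(model="llama3", base_url="http://localhost:11434")

    # Blocking, non-streaming completion.
    print(llm.complete("Why is the sky blue?").text)

    # Streaming completion: each chunk carries an incremental text delta.
    for chunk in llm.stream_complete("Count to five."):
        print(chunk.text, end="", flush=True)
    print()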