fix(llm): special tokens and leading space (#1831)

2024-04-04 14:37:29 +02:00 · 2024-04-04 14:37:29 +02:00 · 347be643f7
parent 08c4ab175e
commit 347be643f7
1 changed files with 8 additions and 1 deletions
--- a/private_gpt/components/llm/custom/sagemaker.py
+++ b/private_gpt/components/llm/custom/sagemaker.py
@@ -243,12 +243,19 @@ class SagemakerLLM(CustomLLM):
            event_stream = resp["Body"]
            start_json = b"{"
            stop_token = "<|endoftext|>"
+            first_token = True

            for line in LineIterator(event_stream):
                if line != b"" and start_json in line:
                    data = json.loads(line[line.find(start_json) :].decode("utf-8"))
-                    if data["token"]["text"] != stop_token:
+                    special = data["token"]["special"]
+                    stop = data["token"]["text"] == stop_token
+                    if not special and not stop:
                        delta = data["token"]["text"]
+                        # trim the leading space for the first token if present
+                        if first_token:
+                            delta = delta.lstrip()
+                            first_token = False
                        text += delta
                        yield CompletionResponse(delta=delta, text=text, raw=data)