rebased and added scripts

WIP almost done, but realized we can just do basic retrieval
WIP
2026-02-16 23:35:46 +00:00 · 2025-04-04 16:53:30 -07:00 · 2025-04-04 16:52:33 -07:00 · 2025-04-04 16:52:33 -07:00
10 changed files with 558 additions and 62 deletions
--- a/backend/onyx/chat/process_message.py
+++ b/backend/onyx/chat/process_message.py
@@ -1089,14 +1089,10 @@ def stream_chat_message_objects(
                        selected_search_docs=selected_db_search_docs,
                        # Deduping happens at the last step to avoid harming quality by dropping content early on
                        # Skip deduping completely for ordering-only mode to save time
-                        dedupe_docs=(
-                            False
-                            if search_for_ordering_only
-                            else (
-                                retrieval_options.dedupe_docs
-                                if retrieval_options
-                                else False
-                            )
+                        dedupe_docs=bool(
+                            not search_for_ordering_only
+                            and retrieval_options
+                            and retrieval_options.dedupe_docs
                        ),
                        user_files=user_file_files if search_for_ordering_only else [],
                        loaded_user_files=user_files
--- a/backend/onyx/configs/app_configs.py
+++ b/backend/onyx/configs/app_configs.py
@@ -163,6 +163,8 @@ INDEX_BATCH_SIZE = int(os.environ.get("INDEX_BATCH_SIZE") or 16)

 MAX_DRIVE_WORKERS = int(os.environ.get("MAX_DRIVE_WORKERS", 4))

+USE_SMART_CHIP_SCOPES = os.environ.get("USE_SMART_CHIP_SCOPES", "").lower() == "true"
+
 # Below are intended to match the env variables names used by the official postgres docker image
 # https://hub.docker.com/_/postgres
 POSTGRES_USER = os.environ.get("POSTGRES_USER") or "postgres"
--- a/backend/onyx/connectors/google_drive/appsscript.json
+++ b/backend/onyx/connectors/google_drive/appsscript.json
@@ -0,0 +1,17 @@
+{
+    "timeZone": "America/Los_Angeles",
+    "dependencies": {
+      "enabledAdvancedServices": [
+        {
+          "userSymbol": "Docs",
+          "version": "v1",
+          "serviceId": "docs"
+        }
+      ]
+    },
+    "exceptionLogging": "STACKDRIVER",
+    "runtimeVersion": "V8",
+    "executionApi": {
+      "access": "MYSELF"
+    }
+  }
--- a/backend/onyx/connectors/google_drive/connector.py
+++ b/backend/onyx/connectors/google_drive/connector.py
@@ -18,6 +18,7 @@ from typing_extensions import override
 from onyx.configs.app_configs import GOOGLE_DRIVE_CONNECTOR_SIZE_THRESHOLD
 from onyx.configs.app_configs import INDEX_BATCH_SIZE
 from onyx.configs.app_configs import MAX_DRIVE_WORKERS
+from onyx.configs.app_configs import USE_SMART_CHIP_SCOPES
 from onyx.configs.constants import DocumentSource
 from onyx.connectors.exceptions import ConnectorValidationError
 from onyx.connectors.exceptions import CredentialExpiredError
@@ -39,12 +40,16 @@ from onyx.connectors.google_drive.models import GoogleDriveFileType
 from onyx.connectors.google_drive.models import RetrievedDriveFile
 from onyx.connectors.google_drive.models import StageCompletion
 from onyx.connectors.google_utils.google_auth import get_google_creds
+from onyx.connectors.google_utils.google_utils import create_scripts_file_objects
 from onyx.connectors.google_utils.google_utils import execute_paginated_retrieval
 from onyx.connectors.google_utils.google_utils import GoogleFields
+from onyx.connectors.google_utils.google_utils import SMART_CHIP_SCRIPT_FILE_NAME
 from onyx.connectors.google_utils.resources import get_admin_service
 from onyx.connectors.google_utils.resources import get_drive_service
 from onyx.connectors.google_utils.resources import get_google_docs_service
+from onyx.connectors.google_utils.resources import get_google_scripts_service
 from onyx.connectors.google_utils.resources import GoogleDriveService
+from onyx.connectors.google_utils.resources import GoogleScriptsService
 from onyx.connectors.google_utils.shared_constants import (
    DB_CREDENTIALS_PRIMARY_ADMIN_KEY,
 )
@@ -90,6 +95,7 @@ def _convert_single_file(
    creds: Any,
    allow_images: bool,
    size_threshold: int,
+    smart_chips_deployment_id: str,
    retriever_email: str,
    file: dict[str, Any],
 ) -> Document | ConnectorFailure | None:
@@ -107,10 +113,15 @@ def _convert_single_file(
    docs_service = lazy_eval(
        lambda: get_google_docs_service(creds, user_email=user_email)
    )
+    scripts_service = lazy_eval(
+        lambda: get_google_scripts_service(creds, user_email=user_email)
+    )
    return convert_drive_item_to_document(
        file=file,
        drive_service=user_drive_service,
        docs_service=docs_service,
+        scripts_service=scripts_service,
+        smart_chips_deployment_id=smart_chips_deployment_id,
        allow_images=allow_images,
        size_threshold=size_threshold,
    )
@@ -176,6 +187,7 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
        my_drive_emails: str | None = None,
        shared_folder_urls: str | None = None,
        batch_size: int = INDEX_BATCH_SIZE,
+        smart_chip_deployment_id: str = "",
        # OLD PARAMETERS
        folder_paths: list[str] | None = None,
        include_shared: bool | None = None,
@@ -248,6 +260,8 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
        self._retrieved_ids: set[str] = set()
        self.allow_images = False

+        self.smart_chip_deployment_id = smart_chip_deployment_id
+
        self.size_threshold = GOOGLE_DRIVE_CONNECTOR_SIZE_THRESHOLD

    def set_allow_images(self, value: bool) -> None:
@@ -295,8 +309,108 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
            source=DocumentSource.GOOGLE_DRIVE,
        )

+        if USE_SMART_CHIP_SCOPES:
+            self.upsert_smart_chip_app_script()
+
        return new_creds_dict

+    @staticmethod
+    def _get_latest_deployment(
+        scripts_service: GoogleScriptsService, script_id: str
+    ) -> dict[str, Any]:
+        """Returns the deployment of `script_id` with the newest `updateTime`.
+
+        Follows `nextPageToken` until every page of deployments has been read.
+        Raises a RuntimeError when the script has no deployments.
+        """
+        found: list[dict[str, Any]] = []
+        list_args: dict[str, str] = {"scriptId": script_id}
+        while True:
+            page = (
+                scripts_service.projects()
+                .deployments()
+                .list(**list_args)
+                .execute()
+            )
+            found.extend(page.get("deployments", []))
+            if "nextPageToken" not in page:
+                break
+            # Ask for the following page on the next iteration.
+            list_args["pageToken"] = page["nextPageToken"]
+
+        if not found:
+            raise RuntimeError(f"No deployments found for script {script_id}")
+
+        def updated_at(deployment: dict[str, Any]) -> float:
+            return datetime.fromisoformat(deployment["updateTime"]).timestamp()
+
+        return max(found, key=updated_at)
+
+    def upsert_smart_chip_app_script(self) -> None:
+        assert self._creds is not None, "creds not set"
+
+        # If a deployment id is provided, we don't need to create a new script.
+        # The deployment id can be retrieved by going under
+        # Deploy -> Test deployments -> Head Deployment ID in the UI (script.google.com)
+        if self.smart_chip_deployment_id:
+            return
+
+        # Step 1: Check if the script already exists by searching the admin drive.
+        drive_service = get_drive_service(
+            self._creds, user_email=self.primary_admin_email
+        )
+        q = f"mimeType = 'application/vnd.google-apps.script' and name = '{SMART_CHIP_SCRIPT_FILE_NAME}' and trashed = false"
+        script_search = (
+            drive_service.files()
+            .list(
+                corpora="user",
+                fields="files(mimeType, id, name)",
+                q=q,
+            )
+            .execute()
+        )
+        script_id = (script_search.get("files") or [{}])[0].get("id")
+        scripts_service = get_google_scripts_service(
+            self._creds, user_email=self.primary_admin_email
+        )
+        if not script_id:
+            # Step 2: Create the script if nonexistent
+            # (Takes about ~10 seconds)
+            req = scripts_service.projects().create(
+                body={"title": SMART_CHIP_SCRIPT_FILE_NAME}
+            )
+            response = req.execute()
+
+            if "scriptId" not in response:
+                raise RuntimeError(
+                    f"Failed to create Smart Chip App Script: {response}"
+                )
+
+            script_id = response["scriptId"]
+            scripts_files = create_scripts_file_objects()
+            # Step 3: Update (upload) the script content
+            response = (
+                scripts_service.projects()
+                .updateContent(scriptId=script_id, body={"files": scripts_files})
+                .execute()
+            )
+
+            if "scriptId" not in response:
+                raise RuntimeError(
+                    f"Failed to update Smart Chip App Script: {response}"
+                )
+
+            script_id = response["scriptId"]
+
+        # Step 4: Get the deployment id
+        self.smart_chip_deployment_id = self._get_latest_deployment(
+            scripts_service, script_id
+        )["deploymentId"]
+
+        # TODO: upsert new version if out of date. We don't expect to do this often.
+        # One way would be to check whether the script files have changed (either via git
+        # or actually pulling the current content and comparing).
+
    def _update_traversed_parent_ids(self, folder_id: str) -> None:
        self._retrieved_ids.add(folder_id)

@@ -952,6 +1066,7 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
                self.creds,
                self.allow_images,
                self.size_threshold,
+                self.smart_chip_deployment_id,
            )
            # Fetch files in batches
            batches_complete = 0
--- a/backend/onyx/connectors/google_drive/doc_conversion.py
+++ b/backend/onyx/connectors/google_drive/doc_conversion.py
@@ -14,6 +14,7 @@ from onyx.connectors.google_drive.models import GoogleDriveFileType
 from onyx.connectors.google_drive.section_extraction import get_document_sections
 from onyx.connectors.google_utils.resources import GoogleDocsService
 from onyx.connectors.google_utils.resources import GoogleDriveService
+from onyx.connectors.google_utils.resources import GoogleScriptsService
 from onyx.connectors.models import ConnectorFailure
 from onyx.connectors.models import Document
 from onyx.connectors.models import DocumentFailure
@@ -224,6 +225,8 @@ def convert_drive_item_to_document(
    file: GoogleDriveFileType,
    drive_service: Callable[[], GoogleDriveService],
    docs_service: Callable[[], GoogleDocsService],
+    scripts_service: Callable[[], GoogleScriptsService],
+    smart_chips_deployment_id: str,
    allow_images: bool,
    size_threshold: int,
 ) -> Document | ConnectorFailure | None:
@@ -244,7 +247,10 @@ def convert_drive_item_to_document(
            try:
                # get_document_sections is the advanced approach for Google Docs
                doc_sections = get_document_sections(
-                    docs_service=docs_service(), doc_id=file.get("id", "")
+                    docs_service=docs_service(),
+                    scripts_service=scripts_service(),
+                    smart_chips_deployment_id=smart_chips_deployment_id,
+                    doc_id=file.get("id", ""),
                )
                if doc_sections:
                    sections = cast(list[TextSection | ImageSection], doc_sections)
--- a/backend/onyx/connectors/google_drive/section_extraction.py
+++ b/backend/onyx/connectors/google_drive/section_extraction.py
@@ -1,23 +1,29 @@
+from collections.abc import Callable
 from typing import Any

 from pydantic import BaseModel

+from onyx.configs.app_configs import USE_SMART_CHIP_SCOPES
 from onyx.connectors.google_utils.resources import GoogleDocsService
+from onyx.connectors.google_utils.resources import GoogleScriptsService
 from onyx.connectors.models import TextSection


+DRIVE_CHIP_CHAR = "\ue907"
+UNKNOWN_SMART_CHIP_STR = "<Smart Chip>"
+
+
 class CurrentHeading(BaseModel):
-    id: str
+    id: str | None
    text: str


-def _build_gdoc_section_link(doc_id: str, heading_id: str) -> str:
+def _build_gdoc_section_link(doc_id: str, tab_id: str, heading_id: str | None) -> str:
    """Builds a Google Doc link that jumps to a specific heading"""
    # NOTE: doesn't support docs with multiple tabs atm, if we need that ask
    # @Chris
-    return (
-        f"https://docs.google.com/document/d/{doc_id}/edit?tab=t.0#heading={heading_id}"
-    )
+    heading_str = f"#heading={heading_id}" if heading_id else ""
+    return f"https://docs.google.com/document/d/{doc_id}/edit?tab={tab_id}{heading_str}"


 def _extract_id_from_heading(paragraph: dict[str, Any]) -> str:
@@ -25,81 +31,230 @@ def _extract_id_from_heading(paragraph: dict[str, Any]) -> str:
    return paragraph["paragraphStyle"]["headingId"]


-def _extract_text_from_paragraph(paragraph: dict[str, Any]) -> str:
+def _extract_text_from_paragraph(
+    paragraph: dict[str, Any], extract_chip: Callable[[int], str | None]
+) -> tuple[str, int]:
    """Extracts the text content from a paragraph element"""
    text_elements = []
    for element in paragraph.get("elements", []):
        if "textRun" in element:
            text_elements.append(element["textRun"].get("content", ""))
-    return "".join(text_elements)
+
+        # Hyperlinks are stored on the text run's style (textRun.textStyle.link),
+        # not on the paragraph element itself, so that is where we look.
+        text_style = element.get("textRun", {}).get("textStyle", {})
+        link_url = text_style.get("link", {}).get("url")
+        if link_url:
+            text_elements.append(f"({link_url})")
+
+        if "person" in element:
+            person_props = element["person"].get("personProperties", {})
+            # Join only the fields that are present, so no dangling ", " is left.
+            person_fields = [
+                f"{label}: {person_props[label]}"
+                for label in ("name", "email")
+                if person_props.get(label)
+            ]
+            text_elements.append(f"<Person|{', '.join(person_fields)}>")
+
+        if "richLink" in element:
+            props = element["richLink"].get("richLinkProperties", {})
+            text_elements.append(f"[{props.get('title', '')}]({props.get('uri', '')})")
+
+    ret = "".join(text_elements)
+
+    # Every DRIVE_CHIP_CHAR marks one non-text element: put the chip text (or the
+    # unknown placeholder) where it was, and return the count alongside the text.
+    text_chunks = ret.split(DRIVE_CHIP_CHAR)
+    num_non_text_elements = len(text_chunks) - 1
+    for i in range(num_non_text_elements):
+        text_chunks[i] += extract_chip(i) or UNKNOWN_SMART_CHIP_STR
+    return "".join(text_chunks), num_non_text_elements
+
+
+def _extract_smart_chips_from_document(
+    document_id: str,
+    scripts_service: GoogleScriptsService,
+    deployment_id: str,
+) -> dict[str, str]:
+    """Extracts smart chips from a Google Doc. Returns a dictionary where
+    the keys are the smart chip location keys and the values are the smart chip text.
+
+    This calls a Google Apps Script function, because most smart chips are not currently
+    available through the API https://issuetracker.google.com/issues/225584757
+
+    Each location key is formatted as "tabNum_paragraphNum_nonTextIndexNum".
+    nonTextIndexNum refers to the index at which the value was found while traversing
+    the paragraph or table cell from left to right, top to bottom.
+
+    There are many non-text elements that are currently not supported by Apps Script, (see
+    https://developers.google.com/apps-script/reference/document/element-type ), so some
+    non-text elements won't have an associated text value.
+    """
+
+    # NOTE: the documentation is incorrect; the script id you must specify is
+    # actually the deployment id (what comes up when you go to Deploy-> Test Deployments)
+    http_request = scripts_service.scripts().run(
+        scriptId=deployment_id,
+        body={
+            "function": "docToChips",
+            "parameters": [document_id],
+            # "devMode": True
+        },
+    )
+    doc = http_request.execute()
+    return doc.get("response", {}).get("result", {})
+
+
+def _extract_text_from_table(
+    table: dict[str, Any], extract_chip: Callable[[int], str | None]
+) -> str:
+    """
+    Extracts the text content from a table element.
+    Smart chip extraction will be wrong for nested tables.
+    """
+    row_strs = []
+    seen_non_text = 0
+
+    def table_extract_chip(non_text_index: int) -> str | None:
+        return extract_chip(non_text_index + seen_non_text)
+
+    for row in table.get("tableRows", []):
+        cells = row.get("tableCells", [])
+        cell_strs = []
+        for cell in cells:
+            child_elements = cell.get("content", {})
+            cell_str = []
+            for child_elem in child_elements:
+                if "paragraph" not in child_elem:
+                    continue
+                text, num_non_text_elements = _extract_text_from_paragraph(
+                    child_elem["paragraph"], table_extract_chip
+                )
+                cell_str.append(text)
+                seen_non_text += num_non_text_elements
+
+            cell_strs.append("".join(cell_str))
+        row_strs.append(", ".join(cell_strs))
+    return "\n".join(row_strs)


 def get_document_sections(
    docs_service: GoogleDocsService,
+    scripts_service: GoogleScriptsService,
+    smart_chips_deployment_id: str,
    doc_id: str,
 ) -> list[TextSection]:
    """Extracts sections from a Google Doc, including their headings and content"""
    # Fetch the document structure
-    doc = docs_service.documents().get(documentId=doc_id).execute()
+    http_request = docs_service.documents().get(documentId=doc_id)
+
+    # Google has poor support for tabs in the docs api, see
+    # https://cloud.google.com/python/docs/reference/cloudtasks/
+    # latest/google.cloud.tasks_v2.types.HttpRequest
+    # https://developers.google.com/workspace/docs/api/how-tos/tabs
+    # https://developers.google.com/workspace/docs/api/reference/rest/v1/documents/get
+    # this is a hack to use the param mentioned in the rest api docs
+    # TODO: check if it can be specified i.e. in documents()
+    http_request.uri += "&includeTabsContent=true"
+    doc = http_request.execute()
+
+    smart_chips = {}
+    if USE_SMART_CHIP_SCOPES:
+        # Get the smart chips
+        smart_chips = _extract_smart_chips_from_document(
+            doc_id, scripts_service, smart_chips_deployment_id
+        )

    # Get the content
-    content = doc.get("body", {}).get("content", [])
+    tabs = doc.get("tabs", {})
+    sections: list[TextSection] = []
+    for tab_num, tab in enumerate(tabs):
+        sections.extend(get_tab_sections(tab, doc_id, tab_num, smart_chips))
+    return sections
+
+
+def _is_heading(paragraph: dict[str, Any]) -> bool:
+    """Reports whether a drive document paragraph is styled as a heading or title.
+
+    A paragraph that carries no `paragraphStyle.namedStyleType` is not a heading.
+    """
+    if "paragraphStyle" not in paragraph:
+        return False
+    para_style = paragraph["paragraphStyle"]
+    if "namedStyleType" not in para_style:
+        return False
+    # One prefix check covers both the HEADING_* and the TITLE* style names.
+    return para_style["namedStyleType"].startswith(("HEADING_", "TITLE"))
+
+
+def _add_finished_section(
+    sections: list[TextSection],
+    doc_id: str,
+    tab_id: str,
+    current_heading: CurrentHeading,
+    current_section: list[str],
+) -> None:
+    """Appends the section built so far to `sections` in place; returns nothing.
+    Does nothing when there is no body text and the current heading text is empty.
+    The stored text is the heading, a newline, then the body lines, stripped.
+    """
+    if not (current_section or current_heading.text):
+        return
+    # If we were building a previous section, add it to sections list
+    section_text = f"{current_heading.text}\n" + "\n".join(current_section)
+    sections.append(
+        TextSection(
+            text=section_text.strip(),
+            link=_build_gdoc_section_link(doc_id, tab_id, current_heading.id),
+        )
+    )
+
+
+def get_tab_sections(
+    tab: dict[str, Any], doc_id: str, tab_num: int, smart_chips: dict[str, str]
+) -> list[TextSection]:
+    tab_id = tab["tabProperties"]["tabId"]
+    content = tab.get("documentTab", {}).get("body", {}).get("content", [])

    sections: list[TextSection] = []
    current_section: list[str] = []
-    current_heading: CurrentHeading | None = None
+    current_heading = CurrentHeading(id=None, text="")

-    for element in content:
-        if "paragraph" not in element:
-            continue
+    for element_num, element in enumerate(content):

-        paragraph = element["paragraph"]
+        def extract_chip(non_text_index: int) -> str | None:
+            return smart_chips.get(f"{tab_num}_{element_num-1}_{non_text_index}")

-        # Check if this is a heading
-        if (
-            "paragraphStyle" in paragraph
-            and "namedStyleType" in paragraph["paragraphStyle"]
-        ):
-            style = paragraph["paragraphStyle"]["namedStyleType"]
-            is_heading = style.startswith("HEADING_")
-            is_title = style.startswith("TITLE")
+        if "paragraph" in element:
+            paragraph = element["paragraph"]

-            if is_heading or is_title:
-                # If we were building a previous section, add it to sections list
-                if current_heading is not None and current_section:
-                    heading_text = current_heading.text
-                    section_text = f"{heading_text}\n" + "\n".join(current_section)
-                    sections.append(
-                        TextSection(
-                            text=section_text.strip(),
-                            link=_build_gdoc_section_link(doc_id, current_heading.id),
-                        )
-                    )
-                    current_section = []
-
-                # Start new heading
-                heading_id = _extract_id_from_heading(paragraph)
-                heading_text = _extract_text_from_paragraph(paragraph)
-                current_heading = CurrentHeading(
-                    id=heading_id,
-                    text=heading_text,
-                )
+            # If this is not a heading, add content to current section
+            if not _is_heading(paragraph):
+                text, _ = _extract_text_from_paragraph(paragraph, extract_chip)
+                if text.strip():
+                    current_section.append(text)
                continue

-        # Add content to current section
-        if current_heading is not None:
-            text = _extract_text_from_paragraph(paragraph)
+            _add_finished_section(
+                sections, doc_id, tab_id, current_heading, current_section
+            )
+
+            current_section = []
+
+            # Start new heading
+            heading_id = _extract_id_from_heading(paragraph)
+            heading_text, _ = _extract_text_from_paragraph(paragraph, extract_chip)
+            current_heading = CurrentHeading(
+                id=heading_id,
+                text=heading_text,
+            )
+        elif "table" in element:
+            text = _extract_text_from_table(element["table"], extract_chip)
            if text.strip():
                current_section.append(text)

    # Don't forget to add the last section
-    if current_heading is not None and current_section:
-        section_text = f"{current_heading.text}\n" + "\n".join(current_section)
-        sections.append(
-            TextSection(
-                text=section_text.strip(),
-                link=_build_gdoc_section_link(doc_id, current_heading.id),
-            )
-        )
+    _add_finished_section(sections, doc_id, tab_id, current_heading, current_section)

    return sections
--- a/backend/onyx/connectors/google_drive/smart_chip_retrieval.gs
+++ b/backend/onyx/connectors/google_drive/smart_chip_retrieval.gs
@@ -0,0 +1,132 @@
+
+/**
+ * Opens the Google Doc with the given id and returns a plain object mapping
+ * getKey(tab, bodyChild, nonTextIndex) to the chip text that parseParagraph and
+ * parseTable report: dates and equations. People and rich links are skipped
+ * there because the Docs API already returns them.
+ */
+function docToChips(document_id) {
+  const doc = DocumentApp.openById(document_id);
+  // Plain object, not a Map: bracket assignment would not create Map entries.
+  const ret = {};
+  doc.getTabs().forEach((tab, tabInd) => {
+    const body = tab.asDocumentTab().getBody();
+    for (let tabChildInd = 0; tabChildInd < body.getNumChildren(); tabChildInd++) {
+      const tabChild = body.getChild(tabChildInd);
+      const callback = (nonTextInd, replaceText) => {
+        ret[getKey(tabInd, tabChildInd, nonTextInd)] = replaceText;
+      };
+      switch (tabChild.getType()) {
+        case DocumentApp.ElementType.PARAGRAPH:
+          parseParagraph(tabChild.asParagraph(), callback);
+          console.log("paragraph", tabChild.asParagraph().getText());
+          break;
+        case DocumentApp.ElementType.TABLE:
+          console.log("table");
+          parseTable(tabChild.asTable(), callback);
+          break;
+        case DocumentApp.ElementType.LIST_ITEM:
+          // parseParagraph is also used for list items.
+          parseParagraph(tabChild.asListItem(), callback);
+          break;
+        default:
+          console.log("found unknown tab body child of type: ", tabChild.getType().toString());
+      }
+    }
+  });
+  console.log(ret);
+  return ret;
+}
+
+// uncomment and paste in a file id (and change the main function to "test")
+// to test the docToChips function
+// function test() {
+//   return docToChips("document id goes here");
+// }
+
+function getKey(tabInd, paragraphInd, nonTextInd) {
+  return `${tabInd}_${paragraphInd}_${nonTextInd}`; // "tab_paragraph_nonText"
+}
+
+// also used for list items
+function parseParagraph(paragraph, callback) {
+  var nonTextInd = 0;
+  for (let i = 0; i < paragraph.getNumChildren(); i++) { //
+    var child = paragraph.getChild(i);
+    switch (child.getType()) {
+      case DocumentApp.ElementType.DATE:
+        console.log(child.asDate().getDisplayText());
+        callback(nonTextInd, child.asDate().getDisplayText());
+        break;
+      case DocumentApp.ElementType.EQUATION:
+        var eqStr = child.getText();
+        console.log("equation: ", eqStr);
+        callback(nonTextInd, eqStr);
+        break;
+      case DocumentApp.ElementType.PERSON:
+        var personStr = "<name: " + child.asPerson().getName() + ", email: "+ child.asPerson().getEmail() + ">";
+        console.log(personStr);
+        //callback(nonTextInd, personStr);
+        nonTextInd--; // Advanced Docs API picks up people
+        break;
+      case DocumentApp.ElementType.RICH_LINK:
+        var richLink = child.asRichLink()
+        var linkStr = "<title: " + richLink.getTitle() + ", type:" + richLink.getMimeType() + ">"
+        console.log(linkStr);
+        // callback(nonTextInd, child.asRichLink().getUrl());
+        nonTextInd--; // Advanced Docs API picks up rich links
+        break;
+      case DocumentApp.ElementType.TEXT:
+        console.log("text: "+ child.asText().getText());
+        //console.log(child.asText().)
+        nonTextInd--;
+        break;
+      case DocumentApp.ElementType.UNSUPPORTED:
+        console.log("unsupported element type");
+        break;
+      default:
+        console.log("found special element type:", child.getType().toString());
+    }
+    nonTextInd++;
+  }
+}
+
+function parseTable(table, callback) {
+  var lastSeenInCell = 0;
+  var allSeenElems = 0
+  const tableCallback = ((nonTextInd, replaceText) => {
+    callback(allSeenElems + lastSeenInCell + nonTextInd, replaceText);
+    lastSeenInCell++;
+  });
+  for (let rowInd = 0; rowInd < table.getNumChildren(); rowInd++) {
+    var row = table.getChild(rowInd);
+    if (row.getType() !== DocumentApp.ElementType.TABLE_ROW) {
+      console.log("table child type: ", row.getType().toString());
+      continue;
+    }
+
+    for (let colInd = 0; colInd < row.getNumChildren(); colInd++) {
+      var cell = row.getChild(colInd);
+      if (cell.getType() !== DocumentApp.ElementType.TABLE_CELL) {
+        console.log("row child type: ", cell.getType().toString());
+        continue;
+      }
+
+      for (let itemInd = 0; itemInd < cell.getNumChildren(); itemInd++) {
+        var item = cell.getChild(itemInd);
+        console.log(item.getType().toString());
+        switch (item.getType()) {
+          case DocumentApp.ElementType.PARAGRAPH:
+          case DocumentApp.ElementType.LIST_ITEM:
+            parseParagraph(item, tableCallback);
+            break;
+          case DocumentApp.ElementType.TABLE:
+            parseTable(item, tableCallback);
+            break;
+        }
+      }
+      allSeenElems += lastSeenInCell;
+      lastSeenInCell = 0;
+    }
+  }
+}
--- a/backend/onyx/connectors/google_utils/google_utils.py
+++ b/backend/onyx/connectors/google_utils/google_utils.py
@@ -1,3 +1,4 @@
+import json
 import re
 import time
 from collections.abc import Callable
@@ -141,3 +142,50 @@ def execute_paginated_retrieval(
                yield item
        else:
            yield results
+
+
+# https://developers.google.com/apps-script/api/reference/rest/v1/File#FileType
+class AppsScriptFileType(str, Enum):
+    UNSPECIFIED = "ENUM_TYPE_UNSPECIFIED"
+    SERVER_JS = "SERVER_JS"
+    HTML = "HTML"
+    JSON = "JSON"
+
+
+SMART_CHIP_RETRIEVAL_FUNCTIONS = [
+    ("docToChips", ["document_id"]),
+    ("getKey", ["tabInd", "paragraphInd", "nonTextInd"]),
+    ("parseParagraph", ["paragraph", "callback"]),
+    ("parseTable", ["table", "callback"]),
+]
+
+SMART_CHIP_SCRIPT_FILE_NAME = "Smart_Chip_Extractor"
+
+
+# https://developers.google.com/apps-script/api/reference/rest/v1/projects/updateContent
+def create_scripts_file_objects() -> list[GoogleDriveFileType]:
+    """Builds the file objects uploaded to the Smart Chip Apps Script project.
+
+    Sources are resolved relative to this module, not the working directory.
+    """
+    from pathlib import Path  # function-scope import keeps this change local
+
+    drive_dir = Path(__file__).resolve().parent.parent / "google_drive"
+    script_source = (drive_dir / "smart_chip_retrieval.gs").read_text()
+    # Parse then re-dump the manifest so malformed JSON fails here, not on upload.
+    manifest = json.loads((drive_dir / "appsscript.json").read_text())
+    functions = [
+        {"name": name, "parameters": params}
+        for name, params in SMART_CHIP_RETRIEVAL_FUNCTIONS
+    ]
+    manifest_file = {
+        "name": "appsscript",
+        "type": AppsScriptFileType.JSON.value,
+        "source": json.dumps(manifest),
+    }
+    script_file = {
+        "name": SMART_CHIP_SCRIPT_FILE_NAME,
+        "type": AppsScriptFileType.SERVER_JS.value,
+        "source": script_source,
+        "functionSet": {"values": functions},
+    }
+    return [manifest_file, script_file]
--- a/backend/onyx/connectors/google_utils/resources.py
+++ b/backend/onyx/connectors/google_utils/resources.py
@@ -12,6 +12,10 @@ class GoogleDocsService(Resource):
    pass


+class GoogleScriptsService(Resource):
+    pass
+
+
 class AdminService(Resource):
    pass

@@ -62,3 +66,10 @@ def get_gmail_service(
    user_email: str | None = None,
 ) -> GmailService:
    return _get_google_service("gmail", "v1", creds, user_email)
+
+
+def get_google_scripts_service(
+    creds: ServiceAccountCredentials | OAuthCredentials,
+    user_email: str | None = None,
+) -> GoogleScriptsService:
+    return _get_google_service("script", "v1", creds, user_email)
--- a/backend/onyx/connectors/google_utils/shared_constants.py
+++ b/backend/onyx/connectors/google_utils/shared_constants.py
@@ -1,5 +1,6 @@
 from enum import Enum as PyEnum

+from onyx.configs.app_configs import USE_SMART_CHIP_SCOPES
 from onyx.configs.constants import DocumentSource

 # NOTE: do not need https://www.googleapis.com/auth/documents.readonly
@@ -18,6 +19,19 @@ GOOGLE_SCOPES = {
    ],
 }

+# TODO: add this to the docs
+GOOGLE_SMART_CHIP_SCOPES = [
+    "https://www.googleapis.com/auth/script.external_request",
+    "https://www.googleapis.com/auth/drive.scripts",
+    "https://www.googleapis.com/auth/script.scriptapp",
+    "https://www.googleapis.com/auth/script.deployments",
+    "https://www.googleapis.com/auth/script.projects",
+    "https://www.googleapis.com/auth/documents",
+]
+
+if USE_SMART_CHIP_SCOPES:
+    GOOGLE_SCOPES[DocumentSource.GOOGLE_DRIVE] += GOOGLE_SMART_CHIP_SCOPES
+
 # This is the Oauth token
 DB_CREDENTIALS_DICT_TOKEN_KEY = "google_tokens"
 # This is the service account key
Author	SHA1	Message	Date
Evan Lohn	81e3975a09	rebased and added scripts	2025-04-04 16:53:30 -07:00
Evan Lohn	0781709a56	WIP almost done, but realized we can just do basic retrieval	2025-04-04 16:52:33 -07:00
Evan Lohn	93529b081c	WIP	2025-04-04 16:52:33 -07:00