mirror of
https://github.com/onyx-dot-app/onyx.git
synced 2026-02-16 23:35:46 +00:00
Compare commits
3 Commits
nightly-la
...
drive-pill
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
81e3975a09 | ||
|
|
0781709a56 | ||
|
|
93529b081c |
@@ -1089,14 +1089,10 @@ def stream_chat_message_objects(
|
||||
selected_search_docs=selected_db_search_docs,
|
||||
# Deduping happens at the last step to avoid harming quality by dropping content early on
|
||||
# Skip deduping completely for ordering-only mode to save time
|
||||
dedupe_docs=(
|
||||
False
|
||||
if search_for_ordering_only
|
||||
else (
|
||||
retrieval_options.dedupe_docs
|
||||
if retrieval_options
|
||||
else False
|
||||
)
|
||||
dedupe_docs=bool(
|
||||
not search_for_ordering_only
|
||||
and retrieval_options
|
||||
and retrieval_options.dedupe_docs
|
||||
),
|
||||
user_files=user_file_files if search_for_ordering_only else [],
|
||||
loaded_user_files=user_files
|
||||
|
||||
@@ -163,6 +163,8 @@ INDEX_BATCH_SIZE = int(os.environ.get("INDEX_BATCH_SIZE") or 16)
|
||||
|
||||
# Use the `or` fallback (same form as INDEX_BATCH_SIZE) so that an empty
# MAX_DRIVE_WORKERS="" falls back to the default instead of raising ValueError.
MAX_DRIVE_WORKERS = int(os.environ.get("MAX_DRIVE_WORKERS") or 4)
|
||||
|
||||
USE_SMART_CHIP_SCOPES = os.environ.get("USE_SMART_CHIP_SCOPES", "").lower() == "true"
|
||||
|
||||
# Below are intended to match the env variables names used by the official postgres docker image
|
||||
# https://hub.docker.com/_/postgres
|
||||
POSTGRES_USER = os.environ.get("POSTGRES_USER") or "postgres"
|
||||
|
||||
17
backend/onyx/connectors/google_drive/appsscript.json
Normal file
17
backend/onyx/connectors/google_drive/appsscript.json
Normal file
@@ -0,0 +1,17 @@
|
||||
{
|
||||
"timeZone": "America/Los_Angeles",
|
||||
"dependencies": {
|
||||
"enabledAdvancedServices": [
|
||||
{
|
||||
"userSymbol": "Docs",
|
||||
"version": "v1",
|
||||
"serviceId": "docs"
|
||||
}
|
||||
]
|
||||
},
|
||||
"exceptionLogging": "STACKDRIVER",
|
||||
"runtimeVersion": "V8",
|
||||
"executionApi": {
|
||||
"access": "MYSELF"
|
||||
}
|
||||
}
|
||||
@@ -18,6 +18,7 @@ from typing_extensions import override
|
||||
from onyx.configs.app_configs import GOOGLE_DRIVE_CONNECTOR_SIZE_THRESHOLD
|
||||
from onyx.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from onyx.configs.app_configs import MAX_DRIVE_WORKERS
|
||||
from onyx.configs.app_configs import USE_SMART_CHIP_SCOPES
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.connectors.exceptions import ConnectorValidationError
|
||||
from onyx.connectors.exceptions import CredentialExpiredError
|
||||
@@ -39,12 +40,16 @@ from onyx.connectors.google_drive.models import GoogleDriveFileType
|
||||
from onyx.connectors.google_drive.models import RetrievedDriveFile
|
||||
from onyx.connectors.google_drive.models import StageCompletion
|
||||
from onyx.connectors.google_utils.google_auth import get_google_creds
|
||||
from onyx.connectors.google_utils.google_utils import create_scripts_file_objects
|
||||
from onyx.connectors.google_utils.google_utils import execute_paginated_retrieval
|
||||
from onyx.connectors.google_utils.google_utils import GoogleFields
|
||||
from onyx.connectors.google_utils.google_utils import SMART_CHIP_SCRIPT_FILE_NAME
|
||||
from onyx.connectors.google_utils.resources import get_admin_service
|
||||
from onyx.connectors.google_utils.resources import get_drive_service
|
||||
from onyx.connectors.google_utils.resources import get_google_docs_service
|
||||
from onyx.connectors.google_utils.resources import get_google_scripts_service
|
||||
from onyx.connectors.google_utils.resources import GoogleDriveService
|
||||
from onyx.connectors.google_utils.resources import GoogleScriptsService
|
||||
from onyx.connectors.google_utils.shared_constants import (
|
||||
DB_CREDENTIALS_PRIMARY_ADMIN_KEY,
|
||||
)
|
||||
@@ -90,6 +95,7 @@ def _convert_single_file(
|
||||
creds: Any,
|
||||
allow_images: bool,
|
||||
size_threshold: int,
|
||||
smart_chips_deployment_id: str,
|
||||
retriever_email: str,
|
||||
file: dict[str, Any],
|
||||
) -> Document | ConnectorFailure | None:
|
||||
@@ -107,10 +113,15 @@ def _convert_single_file(
|
||||
docs_service = lazy_eval(
|
||||
lambda: get_google_docs_service(creds, user_email=user_email)
|
||||
)
|
||||
scripts_service = lazy_eval(
|
||||
lambda: get_google_scripts_service(creds, user_email=user_email)
|
||||
)
|
||||
return convert_drive_item_to_document(
|
||||
file=file,
|
||||
drive_service=user_drive_service,
|
||||
docs_service=docs_service,
|
||||
scripts_service=scripts_service,
|
||||
smart_chips_deployment_id=smart_chips_deployment_id,
|
||||
allow_images=allow_images,
|
||||
size_threshold=size_threshold,
|
||||
)
|
||||
@@ -176,6 +187,7 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
|
||||
my_drive_emails: str | None = None,
|
||||
shared_folder_urls: str | None = None,
|
||||
batch_size: int = INDEX_BATCH_SIZE,
|
||||
smart_chip_deployment_id: str = "",
|
||||
# OLD PARAMETERS
|
||||
folder_paths: list[str] | None = None,
|
||||
include_shared: bool | None = None,
|
||||
@@ -248,6 +260,8 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
|
||||
self._retrieved_ids: set[str] = set()
|
||||
self.allow_images = False
|
||||
|
||||
self.smart_chip_deployment_id = smart_chip_deployment_id
|
||||
|
||||
self.size_threshold = GOOGLE_DRIVE_CONNECTOR_SIZE_THRESHOLD
|
||||
|
||||
def set_allow_images(self, value: bool) -> None:
|
||||
@@ -295,8 +309,108 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
|
||||
source=DocumentSource.GOOGLE_DRIVE,
|
||||
)
|
||||
|
||||
if USE_SMART_CHIP_SCOPES:
|
||||
self.upsert_smart_chip_app_script()
|
||||
|
||||
return new_creds_dict
|
||||
|
||||
@staticmethod
|
||||
def _get_latest_deployment(
    scripts_service: GoogleScriptsService, script_id: str
) -> dict[str, Any]:
    """Return the most recently updated deployment of an Apps Script project.

    Pages through ``projects().deployments().list`` for ``script_id`` and
    picks the deployment with the greatest ``updateTime``.

    Raises:
        RuntimeError: if the project has no deployments at all.
    """
    all_deployments: list[dict[str, Any]] = []
    page_token: str | None = None
    while True:
        list_kwargs: dict[str, Any] = {"scriptId": script_id}
        if page_token:
            list_kwargs["pageToken"] = page_token
        page = (
            scripts_service.projects().deployments().list(**list_kwargs).execute()
        )
        all_deployments.extend(page.get("deployments", []))
        page_token = page.get("nextPageToken")
        if not page_token:
            break

    if not all_deployments:
        raise RuntimeError(f"No deployments found for script {script_id}")

    def _updated_at(deployment: dict[str, Any]) -> float:
        # Timestamps may end in "Z", which datetime.fromisoformat only
        # accepts from Python 3.11 on; normalise to an explicit UTC offset.
        raw = deployment["updateTime"]
        if raw.endswith("Z"):
            raw = raw[:-1] + "+00:00"
        return datetime.fromisoformat(raw).timestamp()

    return max(all_deployments, key=_updated_at)
|
||||
|
||||
def upsert_smart_chip_app_script(self) -> None:
    """Populate ``self.smart_chip_deployment_id``, creating the script if needed.

    Does nothing when a deployment id was supplied up front. Otherwise it
    looks for the smart chip script in the primary admin's drive, creates
    and uploads it when absent, and stores the id of the script's most
    recently updated deployment.

    Raises:
        RuntimeError: if creating or uploading the script does not return a
            ``scriptId``, or if the script has no deployments.
    """
    assert self._creds is not None, "creds not set"

    # A configured deployment id means no script has to be created. It can be
    # found under Deploy -> Test deployments -> Head Deployment ID in the UI
    # (script.google.com).
    if self.smart_chip_deployment_id:
        return

    # Step 1: look for an existing, non-trashed script in the admin drive.
    drive_service = get_drive_service(
        self._creds, user_email=self.primary_admin_email
    )
    search_query = (
        "mimeType = 'application/vnd.google-apps.script' "
        f"and name = '{SMART_CHIP_SCRIPT_FILE_NAME}' and trashed = false"
    )
    search_result = (
        drive_service.files()
        .list(corpora="user", fields="files(mimeType, id, name)", q=search_query)
        .execute()
    )
    matching_files = search_result.get("files")
    script_id = matching_files[0].get("id") if matching_files else None

    scripts_service = get_google_scripts_service(
        self._creds, user_email=self.primary_admin_email
    )

    if not script_id:
        # Step 2: create the (empty) script project. Takes about ~10 seconds.
        created = (
            scripts_service.projects()
            .create(body={"title": SMART_CHIP_SCRIPT_FILE_NAME})
            .execute()
        )
        if "scriptId" not in created:
            raise RuntimeError(f"Failed to create Smart Chip App Script: {created}")
        script_id = created["scriptId"]

        # Step 3: upload the script sources into the new project.
        scripts_files = create_scripts_file_objects()
        uploaded = (
            scripts_service.projects()
            .updateContent(scriptId=script_id, body={"files": scripts_files})
            .execute()
        )
        if "scriptId" not in uploaded:
            raise RuntimeError(f"Failed to update Smart Chip App Script: {uploaded}")
        script_id = uploaded["scriptId"]

    # Step 4: remember the deployment the connector should call.
    latest_deployment = self._get_latest_deployment(scripts_service, script_id)
    self.smart_chip_deployment_id = latest_deployment["deploymentId"]

    # TODO: upsert new version if out of date. We don't expect to do this often.
    # One way would be to check whether the script files have changed (either via git
    # or actually pulling the current content and comparing).
|
||||
|
||||
def _update_traversed_parent_ids(self, folder_id: str) -> None:
|
||||
self._retrieved_ids.add(folder_id)
|
||||
|
||||
@@ -952,6 +1066,7 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
|
||||
self.creds,
|
||||
self.allow_images,
|
||||
self.size_threshold,
|
||||
self.smart_chip_deployment_id,
|
||||
)
|
||||
# Fetch files in batches
|
||||
batches_complete = 0
|
||||
|
||||
@@ -14,6 +14,7 @@ from onyx.connectors.google_drive.models import GoogleDriveFileType
|
||||
from onyx.connectors.google_drive.section_extraction import get_document_sections
|
||||
from onyx.connectors.google_utils.resources import GoogleDocsService
|
||||
from onyx.connectors.google_utils.resources import GoogleDriveService
|
||||
from onyx.connectors.google_utils.resources import GoogleScriptsService
|
||||
from onyx.connectors.models import ConnectorFailure
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import DocumentFailure
|
||||
@@ -224,6 +225,8 @@ def convert_drive_item_to_document(
|
||||
file: GoogleDriveFileType,
|
||||
drive_service: Callable[[], GoogleDriveService],
|
||||
docs_service: Callable[[], GoogleDocsService],
|
||||
scripts_service: Callable[[], GoogleScriptsService],
|
||||
smart_chips_deployment_id: str,
|
||||
allow_images: bool,
|
||||
size_threshold: int,
|
||||
) -> Document | ConnectorFailure | None:
|
||||
@@ -244,7 +247,10 @@ def convert_drive_item_to_document(
|
||||
try:
|
||||
# get_document_sections is the advanced approach for Google Docs
|
||||
doc_sections = get_document_sections(
|
||||
docs_service=docs_service(), doc_id=file.get("id", "")
|
||||
docs_service=docs_service(),
|
||||
scripts_service=scripts_service(),
|
||||
smart_chips_deployment_id=smart_chips_deployment_id,
|
||||
doc_id=file.get("id", ""),
|
||||
)
|
||||
if doc_sections:
|
||||
sections = cast(list[TextSection | ImageSection], doc_sections)
|
||||
|
||||
@@ -1,23 +1,29 @@
|
||||
from collections.abc import Callable
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from onyx.configs.app_configs import USE_SMART_CHIP_SCOPES
|
||||
from onyx.connectors.google_utils.resources import GoogleDocsService
|
||||
from onyx.connectors.google_utils.resources import GoogleScriptsService
|
||||
from onyx.connectors.models import TextSection
|
||||
|
||||
|
||||
DRIVE_CHIP_CHAR = "\ue907"
|
||||
UNKNOWN_SMART_CHIP_STR = "<Smart Chip>"
|
||||
|
||||
|
||||
class CurrentHeading(BaseModel):
    """Heading that the section currently being assembled belongs to."""

    # Heading id used for the "#heading=" anchor of the section link; None
    # for content that appears before the first heading of a tab.
    id: str | None
    # Text of the heading paragraph ("" when there is no heading yet).
    text: str
|
||||
|
||||
|
||||
def _build_gdoc_section_link(doc_id: str, tab_id: str, heading_id: str | None) -> str:
    """Build a Google Doc link that opens a specific tab and, optionally, a heading.

    Args:
        doc_id: id of the Google Doc.
        tab_id: id of the tab the section lives in.
        heading_id: id of the heading to jump to; when None or empty the
            link points at the tab without a heading anchor.
    """
    heading_str = f"#heading={heading_id}" if heading_id else ""
    return f"https://docs.google.com/document/d/{doc_id}/edit?tab={tab_id}{heading_str}"
|
||||
|
||||
|
||||
def _extract_id_from_heading(paragraph: dict[str, Any]) -> str:
|
||||
@@ -25,81 +31,230 @@ def _extract_id_from_heading(paragraph: dict[str, Any]) -> str:
|
||||
return paragraph["paragraphStyle"]["headingId"]
|
||||
|
||||
|
||||
def _extract_text_from_paragraph(
    paragraph: dict[str, Any], extract_chip: Callable[[int], str | None]
) -> tuple[str, int]:
    """Extract the text content of a paragraph, substituting smart chips.

    Text runs are concatenated (a hyperlinked run is followed by its URL in
    parentheses), person chips become ``<Person|name: ..., email: ...>`` and
    rich links become ``[title](uri)``. Every remaining DRIVE_CHIP_CHAR
    placeholder is replaced by ``extract_chip(i)``, where ``i`` is the
    zero-based position of the placeholder within this paragraph, or by
    UNKNOWN_SMART_CHIP_STR when no replacement is available.

    Returns:
        ``(text, num_non_text_elements)`` where the second item is the number
        of placeholders found, so callers can keep a running chip index.
    """
    text_elements: list[str] = []
    for element in paragraph.get("elements", []):
        if "textRun" in element:
            text_run = element["textRun"]
            text_elements.append(text_run.get("content", ""))

            # Handle links: the link is part of the run's own textStyle, not
            # of the enclosing paragraph element.
            link = (text_run.get("textStyle") or {}).get("link") or {}
            if link.get("url"):
                text_elements.append(f"({link['url']})")

        if "person" in element:
            person_props = element["person"].get("personProperties", {})
            person_fields = []
            if person_props.get("name"):
                person_fields.append(f"name: {person_props['name']}")
            if person_props.get("email"):
                person_fields.append(f"email: {person_props['email']}")
            # join() avoids a dangling ", " when only the name is known
            text_elements.append(f"<Person|{', '.join(person_fields)}>")

        if "richLink" in element:
            link_props = element["richLink"].get("richLinkProperties", {})
            title = link_props.get("title", "")
            uri = link_props.get("uri", "")
            text_elements.append(f"[{title}]({uri})")

    # add chip strings in place of each non-text placeholder
    text_chunks = "".join(text_elements).split(DRIVE_CHIP_CHAR)
    num_non_text_elements = len(text_chunks) - 1
    for i in range(num_non_text_elements):
        text_chunks[i] += extract_chip(i) or UNKNOWN_SMART_CHIP_STR
    return "".join(text_chunks), num_non_text_elements
|
||||
|
||||
|
||||
def _extract_smart_chips_from_document(
    document_id: str,
    scripts_service: GoogleScriptsService,
    deployment_id: str,
) -> dict[str, str]:
    """Extracts smart chips from a Google Doc. Returns a dictionary where
    the keys are the smart chip location keys and the values are the smart chip text.

    This calls a Google Apps Script function, because most smart chips are not currently
    available through the API https://issuetracker.google.com/issues/225584757

    Each location key is formatted as "tabNum_paragraphNum_nonTextIndexNum".
    nonTextIndexNum refers to the index at which the value was found while traversing
    the paragraph or table cell from left to right, top to bottom.

    There are many non-text elements that are currently not supported by Apps Script, (see
    https://developers.google.com/apps-script/reference/document/element-type ), so some
    non-text elements won't have an associated text value.

    If the script run reports an error, or does not return a mapping, a
    warning is logged where applicable and an empty dict is returned, so
    chips fall back to their placeholder text.
    """
    import logging

    # NOTE: the documentation is incorrect; the script id you must specify is
    # actually the deployment id (what comes up when you go to Deploy-> Test Deployments)
    http_request = scripts_service.scripts().run(
        scriptId=deployment_id,
        body={
            "function": "docToChips",
            "parameters": [document_id],
            # "devMode": True
        },
    )
    operation = http_request.execute()

    # A failure inside the script is reported in the "error" field of the
    # returned operation rather than as an HTTP error.
    if operation.get("error"):
        logging.getLogger(__name__).warning(
            "Smart chip extraction failed for document %s: %s",
            document_id,
            operation["error"],
        )
        return {}

    result = (operation.get("response") or {}).get("result")
    # The script may return null/non-object values; honour the dict contract.
    return result if isinstance(result, dict) else {}
|
||||
|
||||
|
||||
def _extract_text_from_table(
    table: dict[str, Any], extract_chip: Callable[[int], str | None]
) -> str:
    """
    Render a table element as text: cells of a row are joined with ", " and
    rows are joined with newlines. Only paragraph children of a cell are read.

    Chip indices are counted across the whole table, so each paragraph's
    local chip index is shifted by the number of non-text elements seen in
    the paragraphs before it.
    Smart chip extraction will be wrong for nested tables.
    """
    chips_before = 0

    def offset_chip(local_index: int) -> str | None:
        # reads chips_before at call time, i.e. the count so far
        return extract_chip(local_index + chips_before)

    rendered_rows: list[str] = []
    for table_row in table.get("tableRows", []):
        rendered_cells: list[str] = []
        for table_cell in table_row.get("tableCells", []):
            cell_parts: list[str] = []
            for cell_child in table_cell.get("content", {}):
                if "paragraph" in cell_child:
                    paragraph_text, chip_count = _extract_text_from_paragraph(
                        cell_child["paragraph"], offset_chip
                    )
                    cell_parts.append(paragraph_text)
                    chips_before += chip_count
            rendered_cells.append("".join(cell_parts))
        rendered_rows.append(", ".join(rendered_cells))
    return "\n".join(rendered_rows)
|
||||
|
||||
|
||||
def get_document_sections(
    docs_service: GoogleDocsService,
    scripts_service: GoogleScriptsService,
    smart_chips_deployment_id: str,
    doc_id: str,
) -> list[TextSection]:
    """Extracts sections from a Google Doc, including their headings and content.

    The document is fetched once with the content of all tabs included, and
    the sections of each tab are concatenated in tab order. When
    USE_SMART_CHIP_SCOPES is enabled, smart chip replacement text is fetched
    through the Apps Script deployment ``smart_chips_deployment_id`` first.
    """
    # Fetch the document structure
    http_request = docs_service.documents().get(documentId=doc_id)

    # Google has poor support for tabs in the docs api, see
    # https://developers.google.com/workspace/docs/api/how-tos/tabs
    # https://developers.google.com/workspace/docs/api/reference/rest/v1/documents/get
    # this is a hack to use the param mentioned in the rest api docs
    # (appending with "&" assumes the request uri already has a query string)
    # TODO: check if it can be specified i.e. in documents()
    http_request.uri += "&includeTabsContent=true"
    doc = http_request.execute()

    smart_chips: dict[str, str] = {}
    if USE_SMART_CHIP_SCOPES:
        # Get the smart chips
        smart_chips = _extract_smart_chips_from_document(
            doc_id, scripts_service, smart_chips_deployment_id
        )

    # Get the content, tab by tab
    sections: list[TextSection] = []
    for tab_num, tab in enumerate(doc.get("tabs", [])):
        sections.extend(get_tab_sections(tab, doc_id, tab_num, smart_chips))
    return sections
|
||||
|
||||
|
||||
def _is_heading(paragraph: dict[str, Any]) -> bool:
    """Tell whether a paragraph's named style marks it as a heading or title.

    A paragraph without a ``paragraphStyle`` or without a ``namedStyleType``
    in it is never a heading.
    """
    if "paragraphStyle" not in paragraph:
        return False
    paragraph_style = paragraph["paragraphStyle"]
    if "namedStyleType" not in paragraph_style:
        return False

    # HEADING_* covers the numbered heading levels; TITLE is the doc title.
    return paragraph_style["namedStyleType"].startswith(("HEADING_", "TITLE"))
|
||||
|
||||
|
||||
def _add_finished_section(
    sections: list[TextSection],
    doc_id: str,
    tab_id: str,
    current_heading: CurrentHeading,
    current_section: list[str],
) -> None:
    """Append the section built so far to ``sections`` if it has any text.

    ``sections`` is mutated in place; nothing is returned. The section text
    is the heading text followed by the collected content lines, and its
    link points at the heading (or just the tab when the heading has no id).
    A section whose combined text is empty or whitespace-only, such as a
    blank heading with no content, is not added.
    """
    section_text = (
        f"{current_heading.text}\n" + "\n".join(current_section)
    ).strip()
    # Heading text may be whitespace-only (e.g. just "\n"), so test the
    # stripped result rather than the truthiness of the raw heading text.
    if not section_text:
        return

    sections.append(
        TextSection(
            text=section_text,
            link=_build_gdoc_section_link(doc_id, tab_id, current_heading.id),
        )
    )
|
||||
|
||||
|
||||
def get_tab_sections(
    tab: dict[str, Any], doc_id: str, tab_num: int, smart_chips: dict[str, str]
) -> list[TextSection]:
    """Split one document tab into sections, one per heading.

    Content before the first heading is collected under a heading with no id
    and empty text. Paragraphs and tables contribute text to the current
    section; a heading paragraph closes the current section and starts a new
    one.

    Args:
        tab: tab object; must contain ``tabProperties.tabId``.
        doc_id: id of the document, used for section links.
        tab_num: position of the tab in the document, used in chip keys.
        smart_chips: chip replacement text keyed by
            "tabNum_paragraphNum_nonTextIndexNum".
    """
    tab_id = tab["tabProperties"]["tabId"]
    content = tab.get("documentTab", {}).get("body", {}).get("content", [])

    sections: list[TextSection] = []
    current_section: list[str] = []
    current_heading = CurrentHeading(id=None, text="")

    for element_num, element in enumerate(content):
        # NOTE(review): the -1 presumably compensates for a leading structural
        # element in this content list that the chip script does not count;
        # confirm against the script's child indexing.
        chip_key_prefix = f"{tab_num}_{element_num - 1}_"

        def extract_chip(
            non_text_index: int, key_prefix: str = chip_key_prefix
        ) -> str | None:
            # key_prefix is bound per iteration via the default argument
            return smart_chips.get(f"{key_prefix}{non_text_index}")

        if "paragraph" in element:
            paragraph = element["paragraph"]

            # If this is not a heading, add content to current section
            if not _is_heading(paragraph):
                text, _ = _extract_text_from_paragraph(paragraph, extract_chip)
                if text.strip():
                    current_section.append(text)
                continue

            # A heading closes the section that was being built
            _add_finished_section(
                sections, doc_id, tab_id, current_heading, current_section
            )
            current_section = []

            # Start new heading
            heading_id = _extract_id_from_heading(paragraph)
            heading_text, _ = _extract_text_from_paragraph(paragraph, extract_chip)
            current_heading = CurrentHeading(
                id=heading_id,
                text=heading_text,
            )
        elif "table" in element:
            text = _extract_text_from_table(element["table"], extract_chip)
            if text.strip():
                current_section.append(text)

    # Don't forget to add the last section
    _add_finished_section(sections, doc_id, tab_id, current_heading, current_section)

    return sections
|
||||
|
||||
132
backend/onyx/connectors/google_drive/smart_chip_retrieval.gs
Normal file
132
backend/onyx/connectors/google_drive/smart_chip_retrieval.gs
Normal file
@@ -0,0 +1,132 @@
|
||||
|
||||
/**
 * Retrieves the given Google doc by id and extracts the replacement text of
 * its smart chips into a plain object keyed by tab, body-child, and
 * non-text-component index (see getKey).
 *
 * @param {string} document_id id of the document to open
 * @return {Object<string, string>} chip text keyed by "tab_child_nonTextIndex"
 */
function docToChips(document_id) {
  const doc = DocumentApp.openById(document_id);
  // A plain object (not a Map): entries are written with bracket notation
  // and the value is returned to the API caller as a JSON-style object.
  const chips = {};
  doc.getTabs().forEach((tab, tabInd) => {
    const body = tab.asDocumentTab().getBody();
    for (let tabChildInd = 0; tabChildInd < body.getNumChildren(); tabChildInd++) {
      const tabChild = body.getChild(tabChildInd);
      const callback = (nonTextInd, replaceText) => {
        chips[getKey(tabInd, tabChildInd, nonTextInd)] = replaceText;
      };
      switch (tabChild.getType()) {
        case DocumentApp.ElementType.PARAGRAPH:
          parseParagraph(tabChild.asParagraph(), callback);
          break;
        case DocumentApp.ElementType.TABLE:
          parseTable(tabChild.asTable(), callback);
          break;
        case DocumentApp.ElementType.LIST_ITEM:
          // list items are traversed exactly like paragraphs
          parseParagraph(tabChild.asListItem(), callback);
          break;
        default:
          console.log("found unknown tab body child of type: ", tabChild.getType().toString());
      }
    }
  });
  return chips;
}
|
||||
|
||||
// uncomment and paste in a file id (and change the main function to "test")
|
||||
// to test the docToChips function
|
||||
// function test() {
|
||||
// return docToChips("document id goes here");
|
||||
// }
|
||||
|
||||
/**
 * Builds the lookup key for a chip: "<tab>_<paragraph>_<nonTextIndex>".
 */
function getKey(tabInd, paragraphInd, nonTextInd) {
  return `${tabInd}_${paragraphInd}_${nonTextInd}`;
}
|
||||
|
||||
// also used for list items
/**
 * Walks the children of a paragraph (or list item) and reports the
 * replacement text of supported smart chips through callback(index, text).
 *
 * The index counts only children that are not text, person, or rich-link
 * elements, because the Advanced Docs API already exposes those three.
 * Children that are counted but have no replacement text (unsupported or
 * unknown types) still advance the index.
 *
 * @return {number} how many counted (non-text) children the paragraph has
 */
function parseParagraph(paragraph, callback) {
  var nonTextInd = 0;
  for (let i = 0; i < paragraph.getNumChildren(); i++) {
    var child = paragraph.getChild(i);
    switch (child.getType()) {
      case DocumentApp.ElementType.TEXT:
      case DocumentApp.ElementType.PERSON: // Advanced Docs API picks up people
      case DocumentApp.ElementType.RICH_LINK: // Advanced Docs API picks up rich links
        // not counted as a non-text element
        break;
      case DocumentApp.ElementType.DATE:
        callback(nonTextInd, child.asDate().getDisplayText());
        nonTextInd++;
        break;
      case DocumentApp.ElementType.EQUATION:
        callback(nonTextInd, child.asEquation().getText());
        nonTextInd++;
        break;
      case DocumentApp.ElementType.UNSUPPORTED:
        console.log("unsupported element type");
        nonTextInd++;
        break;
      default:
        console.log("found special element type:", child.getType().toString());
        nonTextInd++;
    }
  }
  return nonTextInd;
}

/**
 * Walks every paragraph and list item of a table, row by row and cell by
 * cell, and reports chips through callback(index, text).
 *
 * The index is table-wide: each paragraph's local index is shifted by the
 * number of counted non-text children in all paragraphs before it, which is
 * how the Python consumer numbers chips inside a table. Nested tables are
 * skipped, since the consumer does not read them either and counting them
 * would shift the indices of everything that follows.
 *
 * @return {number} how many counted (non-text) children the table has
 */
function parseTable(table, callback) {
  var seenNonText = 0;
  const tableCallback = (nonTextInd, replaceText) => {
    callback(seenNonText + nonTextInd, replaceText);
  };
  for (let rowInd = 0; rowInd < table.getNumChildren(); rowInd++) {
    var row = table.getChild(rowInd);
    if (row.getType() !== DocumentApp.ElementType.TABLE_ROW) {
      console.log("table child type: ", row.getType().toString());
      continue;
    }

    for (let colInd = 0; colInd < row.getNumChildren(); colInd++) {
      var cell = row.getChild(colInd);
      if (cell.getType() !== DocumentApp.ElementType.TABLE_CELL) {
        console.log("row child type: ", cell.getType().toString());
        continue;
      }

      for (let itemInd = 0; itemInd < cell.getNumChildren(); itemInd++) {
        var item = cell.getChild(itemInd);
        switch (item.getType()) {
          case DocumentApp.ElementType.PARAGRAPH:
          case DocumentApp.ElementType.LIST_ITEM:
            // the offset only moves once the whole paragraph has been parsed
            seenNonText += parseParagraph(item, tableCallback);
            break;
          case DocumentApp.ElementType.TABLE:
            console.log("skipping nested table");
            break;
        }
      }
    }
  }
  return seenNonText;
}
|
||||
@@ -1,3 +1,4 @@
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
from collections.abc import Callable
|
||||
@@ -141,3 +142,50 @@ def execute_paginated_retrieval(
|
||||
yield item
|
||||
else:
|
||||
yield results
|
||||
|
||||
|
||||
# https://developers.google.com/apps-script/api/reference/rest/v1/File#FileType
class AppsScriptFileType(str, Enum):
    """Values for the ``type`` field of a file in an Apps Script project."""

    UNSPECIFIED = "ENUM_TYPE_UNSPECIFIED"
    SERVER_JS = "SERVER_JS"
    HTML = "HTML"
    JSON = "JSON"
|
||||
|
||||
|
||||
# (function name, parameter names) for every function declared in the
# smart chip retrieval script; used to build the uploaded file's functionSet.
SMART_CHIP_RETRIEVAL_FUNCTIONS = [
    ("docToChips", ["document_id"]),
    ("getKey", ["tabInd", "paragraphInd", "nonTextInd"]),
    ("parseParagraph", ["paragraph", "callback"]),
    ("parseTable", ["table", "callback"]),
]

# Title of the Apps Script project and name of its script file.
SMART_CHIP_SCRIPT_FILE_NAME = "Smart_Chip_Extractor"
|
||||
|
||||
|
||||
# https://developers.google.com/apps-script/api/reference/rest/v1/projects/updateContent
def create_scripts_file_objects() -> list[GoogleDriveFileType]:
    """Build the ``files`` payload used to upload the smart chip script.

    Reads the script source and the ``appsscript.json`` manifest from the
    ``google_drive`` connector package and returns two file objects: the
    manifest (re-serialised JSON) and the server-side script together with
    the list of functions it declares.
    """
    from pathlib import Path

    # Locate the sources relative to this module (which lives in the sibling
    # google_utils package) so the lookup does not depend on the process's
    # working directory.
    script_dir = Path(__file__).resolve().parent.parent / "google_drive"
    script_source = (script_dir / "smart_chip_retrieval.gs").read_text(
        encoding="utf-8"
    )
    # Round-tripping through json validates the manifest before upload.
    appsscript_source = json.loads(
        (script_dir / "appsscript.json").read_text(encoding="utf-8")
    )

    function_set = [
        {"name": name, "parameters": params}
        for name, params in SMART_CHIP_RETRIEVAL_FUNCTIONS
    ]
    return [
        {
            "name": "appsscript",
            "type": AppsScriptFileType.JSON.value,
            "source": json.dumps(appsscript_source),
        },
        {
            "name": SMART_CHIP_SCRIPT_FILE_NAME,
            "type": AppsScriptFileType.SERVER_JS.value,
            "source": script_source,
            "functionSet": {"values": function_set},
        },
    ]
|
||||
|
||||
@@ -12,6 +12,10 @@ class GoogleDocsService(Resource):
|
||||
pass
|
||||
|
||||
|
||||
class GoogleScriptsService(Resource):
    """Marker type for the Apps Script service returned by get_google_scripts_service."""
|
||||
|
||||
|
||||
class AdminService(Resource):
|
||||
pass
|
||||
|
||||
@@ -62,3 +66,10 @@ def get_gmail_service(
|
||||
user_email: str | None = None,
|
||||
) -> GmailService:
|
||||
return _get_google_service("gmail", "v1", creds, user_email)
|
||||
|
||||
|
||||
def get_google_scripts_service(
    creds: ServiceAccountCredentials | OAuthCredentials,
    user_email: str | None = None,
) -> GoogleScriptsService:
    """Return a client for version "v1" of the "script" (Apps Script) service.

    ``creds`` and ``user_email`` are forwarded unchanged to the shared
    service factory.
    """
    service_name, service_version = "script", "v1"
    return _get_google_service(service_name, service_version, creds, user_email)
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from enum import Enum as PyEnum
|
||||
|
||||
from onyx.configs.app_configs import USE_SMART_CHIP_SCOPES
|
||||
from onyx.configs.constants import DocumentSource
|
||||
|
||||
# NOTE: do not need https://www.googleapis.com/auth/documents.readonly
|
||||
@@ -18,6 +19,19 @@ GOOGLE_SCOPES = {
|
||||
],
|
||||
}
|
||||
|
||||
# TODO: add this to the docs
# Extra OAuth scopes requested for the Google Drive connector when smart chip
# extraction is enabled.
GOOGLE_SMART_CHIP_SCOPES = [
    "https://www.googleapis.com/auth/script.external_request",
    "https://www.googleapis.com/auth/drive.scripts",
    "https://www.googleapis.com/auth/script.scriptapp",
    "https://www.googleapis.com/auth/script.deployments",
    "https://www.googleapis.com/auth/script.projects",
    "https://www.googleapis.com/auth/documents",
]

# Opt-in only: the Drive scope list is extended in place when the flag is set.
if USE_SMART_CHIP_SCOPES:
    GOOGLE_SCOPES[DocumentSource.GOOGLE_DRIVE].extend(GOOGLE_SMART_CHIP_SCOPES)
|
||||
|
||||
# This is the Oauth token
|
||||
DB_CREDENTIALS_DICT_TOKEN_KEY = "google_tokens"
|
||||
# This is the service account key
|
||||
|
||||
Reference in New Issue
Block a user