Compare commits

...

3 Commits

Author SHA1 Message Date
Evan Lohn
81e3975a09 rebased and added scripts 2025-04-04 16:53:30 -07:00
Evan Lohn
0781709a56 WIP almost done, but realized we can just do basic retrieval 2025-04-04 16:52:33 -07:00
Evan Lohn
93529b081c WIP 2025-04-04 16:52:33 -07:00
10 changed files with 558 additions and 62 deletions

View File

@@ -1089,14 +1089,10 @@ def stream_chat_message_objects(
selected_search_docs=selected_db_search_docs,
# Deduping happens at the last step to avoid harming quality by dropping content early on
# Skip deduping completely for ordering-only mode to save time
dedupe_docs=(
False
if search_for_ordering_only
else (
retrieval_options.dedupe_docs
if retrieval_options
else False
)
dedupe_docs=bool(
not search_for_ordering_only
and retrieval_options
and retrieval_options.dedupe_docs
),
user_files=user_file_files if search_for_ordering_only else [],
loaded_user_files=user_files

View File

@@ -163,6 +163,8 @@ INDEX_BATCH_SIZE = int(os.environ.get("INDEX_BATCH_SIZE") or 16)
# Read with `or` (same pattern as INDEX_BATCH_SIZE) so that an env var that is
# set but empty falls back to the default instead of crashing on int("").
MAX_DRIVE_WORKERS = int(os.environ.get("MAX_DRIVE_WORKERS") or 4)
# Opt-in flag: only the (case-insensitive) string "true" enables it.
USE_SMART_CHIP_SCOPES = os.environ.get("USE_SMART_CHIP_SCOPES", "").lower() == "true"
# Below are intended to match the env variables names used by the official postgres docker image
# https://hub.docker.com/_/postgres
POSTGRES_USER = os.environ.get("POSTGRES_USER") or "postgres"

View File

@@ -0,0 +1,17 @@
{
"timeZone": "America/Los_Angeles",
"dependencies": {
"enabledAdvancedServices": [
{
"userSymbol": "Docs",
"version": "v1",
"serviceId": "docs"
}
]
},
"exceptionLogging": "STACKDRIVER",
"runtimeVersion": "V8",
"executionApi": {
"access": "MYSELF"
}
}

View File

@@ -18,6 +18,7 @@ from typing_extensions import override
from onyx.configs.app_configs import GOOGLE_DRIVE_CONNECTOR_SIZE_THRESHOLD
from onyx.configs.app_configs import INDEX_BATCH_SIZE
from onyx.configs.app_configs import MAX_DRIVE_WORKERS
from onyx.configs.app_configs import USE_SMART_CHIP_SCOPES
from onyx.configs.constants import DocumentSource
from onyx.connectors.exceptions import ConnectorValidationError
from onyx.connectors.exceptions import CredentialExpiredError
@@ -39,12 +40,16 @@ from onyx.connectors.google_drive.models import GoogleDriveFileType
from onyx.connectors.google_drive.models import RetrievedDriveFile
from onyx.connectors.google_drive.models import StageCompletion
from onyx.connectors.google_utils.google_auth import get_google_creds
from onyx.connectors.google_utils.google_utils import create_scripts_file_objects
from onyx.connectors.google_utils.google_utils import execute_paginated_retrieval
from onyx.connectors.google_utils.google_utils import GoogleFields
from onyx.connectors.google_utils.google_utils import SMART_CHIP_SCRIPT_FILE_NAME
from onyx.connectors.google_utils.resources import get_admin_service
from onyx.connectors.google_utils.resources import get_drive_service
from onyx.connectors.google_utils.resources import get_google_docs_service
from onyx.connectors.google_utils.resources import get_google_scripts_service
from onyx.connectors.google_utils.resources import GoogleDriveService
from onyx.connectors.google_utils.resources import GoogleScriptsService
from onyx.connectors.google_utils.shared_constants import (
DB_CREDENTIALS_PRIMARY_ADMIN_KEY,
)
@@ -90,6 +95,7 @@ def _convert_single_file(
creds: Any,
allow_images: bool,
size_threshold: int,
smart_chips_deployment_id: str,
retriever_email: str,
file: dict[str, Any],
) -> Document | ConnectorFailure | None:
@@ -107,10 +113,15 @@ def _convert_single_file(
docs_service = lazy_eval(
lambda: get_google_docs_service(creds, user_email=user_email)
)
scripts_service = lazy_eval(
lambda: get_google_scripts_service(creds, user_email=user_email)
)
return convert_drive_item_to_document(
file=file,
drive_service=user_drive_service,
docs_service=docs_service,
scripts_service=scripts_service,
smart_chips_deployment_id=smart_chips_deployment_id,
allow_images=allow_images,
size_threshold=size_threshold,
)
@@ -176,6 +187,7 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
my_drive_emails: str | None = None,
shared_folder_urls: str | None = None,
batch_size: int = INDEX_BATCH_SIZE,
smart_chip_deployment_id: str = "",
# OLD PARAMETERS
folder_paths: list[str] | None = None,
include_shared: bool | None = None,
@@ -248,6 +260,8 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
self._retrieved_ids: set[str] = set()
self.allow_images = False
self.smart_chip_deployment_id = smart_chip_deployment_id
self.size_threshold = GOOGLE_DRIVE_CONNECTOR_SIZE_THRESHOLD
def set_allow_images(self, value: bool) -> None:
@@ -295,8 +309,108 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
source=DocumentSource.GOOGLE_DRIVE,
)
if USE_SMART_CHIP_SCOPES:
self.upsert_smart_chip_app_script()
return new_creds_dict
@staticmethod
def _get_latest_deployment(
    scripts_service: GoogleScriptsService, script_id: str
) -> dict[str, Any]:
    """Return the most recently updated deployment of an Apps Script project.

    Every page of ``projects.deployments.list`` for ``script_id`` is fetched
    and the deployment with the greatest ``updateTime`` is returned.

    Raises:
        RuntimeError: if no deployment is listed for the script.
    """

    def fetch_page(**paging: str) -> dict[str, Any]:
        # One list call; `paging` is empty for the first page and carries
        # `pageToken` for every following one.
        return (
            scripts_service.projects()
            .deployments()
            .list(scriptId=script_id, **paging)
            .execute()
        )

    def updated_at(deployment: dict[str, Any]) -> float:
        return datetime.fromisoformat(deployment["updateTime"]).timestamp()

    page = fetch_page()
    found = page.get("deployments", [])
    while "nextPageToken" in page:
        page = fetch_page(pageToken=page["nextPageToken"])
        found.extend(page.get("deployments", []))

    if len(found) == 0:
        raise RuntimeError(f"No deployments found for script {script_id}")

    return max(found, key=updated_at)
def upsert_smart_chip_app_script(self) -> None:
    """Ensure the smart chip Apps Script exists and record its deployment id.

    Does nothing when ``self.smart_chip_deployment_id`` is already set.
    Otherwise the primary admin's Drive is searched for a script named
    ``SMART_CHIP_SCRIPT_FILE_NAME``; if none is found the project is created
    and its content uploaded. In both cases the id of the most recently
    updated deployment is stored on ``self.smart_chip_deployment_id``.

    Raises:
        RuntimeError: if creating or updating the script does not return a
            ``scriptId``, or if the script has no deployments.
    """
    assert self._creds is not None, "creds not set"

    # A deployment id supplied by the user means there is nothing to create.
    # It can be found in the UI (script.google.com) under
    # Deploy -> Test deployments -> Head Deployment ID
    if self.smart_chip_deployment_id:
        return

    # Step 1: Look for an existing script in the admin's drive.
    admin_drive = get_drive_service(
        self._creds, user_email=self.primary_admin_email
    )
    script_query = (
        "mimeType = 'application/vnd.google-apps.script' "
        f"and name = '{SMART_CHIP_SCRIPT_FILE_NAME}' and trashed = false"
    )
    search_result = (
        admin_drive.files()
        .list(
            corpora="user",
            fields="files(mimeType, id, name)",
            q=script_query,
        )
        .execute()
    )
    # an absent or empty "files" list yields script_id = None
    matches = search_result.get("files") or [{}]
    script_id = matches[0].get("id")

    scripts_service = get_google_scripts_service(
        self._creds, user_email=self.primary_admin_email
    )

    if not script_id:
        # Step 2: Create the script project (takes about ~10 seconds)
        created = (
            scripts_service.projects()
            .create(body={"title": SMART_CHIP_SCRIPT_FILE_NAME})
            .execute()
        )
        if "scriptId" not in created:
            raise RuntimeError(f"Failed to create Smart Chip App Script: {created}")
        script_id = created["scriptId"]

        # Step 3: Upload the script content into the new project
        scripts_files = create_scripts_file_objects()
        uploaded = (
            scripts_service.projects()
            .updateContent(scriptId=script_id, body={"files": scripts_files})
            .execute()
        )
        if "scriptId" not in uploaded:
            raise RuntimeError(f"Failed to update Smart Chip App Script: {uploaded}")
        script_id = uploaded["scriptId"]

    # Step 4: Remember the id of the latest deployment
    latest_deployment = self._get_latest_deployment(scripts_service, script_id)
    self.smart_chip_deployment_id = latest_deployment["deploymentId"]

    # TODO: upsert new version if out of date. We don't expect to do this often.
    # One way would be to check whether the script files have changed (either via git
    # or actually pulling the current content and comparing).
def _update_traversed_parent_ids(self, folder_id: str) -> None:
self._retrieved_ids.add(folder_id)
@@ -952,6 +1066,7 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
self.creds,
self.allow_images,
self.size_threshold,
self.smart_chip_deployment_id,
)
# Fetch files in batches
batches_complete = 0

View File

@@ -14,6 +14,7 @@ from onyx.connectors.google_drive.models import GoogleDriveFileType
from onyx.connectors.google_drive.section_extraction import get_document_sections
from onyx.connectors.google_utils.resources import GoogleDocsService
from onyx.connectors.google_utils.resources import GoogleDriveService
from onyx.connectors.google_utils.resources import GoogleScriptsService
from onyx.connectors.models import ConnectorFailure
from onyx.connectors.models import Document
from onyx.connectors.models import DocumentFailure
@@ -224,6 +225,8 @@ def convert_drive_item_to_document(
file: GoogleDriveFileType,
drive_service: Callable[[], GoogleDriveService],
docs_service: Callable[[], GoogleDocsService],
scripts_service: Callable[[], GoogleScriptsService],
smart_chips_deployment_id: str,
allow_images: bool,
size_threshold: int,
) -> Document | ConnectorFailure | None:
@@ -244,7 +247,10 @@ def convert_drive_item_to_document(
try:
# get_document_sections is the advanced approach for Google Docs
doc_sections = get_document_sections(
docs_service=docs_service(), doc_id=file.get("id", "")
docs_service=docs_service(),
scripts_service=scripts_service(),
smart_chips_deployment_id=smart_chips_deployment_id,
doc_id=file.get("id", ""),
)
if doc_sections:
sections = cast(list[TextSection | ImageSection], doc_sections)

View File

@@ -1,23 +1,29 @@
from collections.abc import Callable
from typing import Any
from pydantic import BaseModel
from onyx.configs.app_configs import USE_SMART_CHIP_SCOPES
from onyx.connectors.google_utils.resources import GoogleDocsService
from onyx.connectors.google_utils.resources import GoogleScriptsService
from onyx.connectors.models import TextSection
DRIVE_CHIP_CHAR = "\ue907"
UNKNOWN_SMART_CHIP_STR = "<Smart Chip>"
class CurrentHeading(BaseModel):
id: str
id: str | None
text: str
def _build_gdoc_section_link(doc_id: str, heading_id: str) -> str:
def _build_gdoc_section_link(doc_id: str, tab_id: str, heading_id: str | None) -> str:
"""Builds a Google Doc link that jumps to a specific heading"""
# NOTE: doesn't support docs with multiple tabs atm, if we need that ask
# @Chris
return (
f"https://docs.google.com/document/d/{doc_id}/edit?tab=t.0#heading={heading_id}"
)
heading_str = f"#heading={heading_id}" if heading_id else ""
return f"https://docs.google.com/document/d/{doc_id}/edit?tab={tab_id}{heading_str}"
def _extract_id_from_heading(paragraph: dict[str, Any]) -> str:
@@ -25,81 +31,230 @@ def _extract_id_from_heading(paragraph: dict[str, Any]) -> str:
return paragraph["paragraphStyle"]["headingId"]
def _extract_text_from_paragraph(paragraph: dict[str, Any]) -> str:
def _extract_text_from_paragraph(
paragraph: dict[str, Any], extract_chip: Callable[[int], str | None]
) -> tuple[str, int]:
"""Extracts the text content from a paragraph element"""
text_elements = []
for element in paragraph.get("elements", []):
if "textRun" in element:
text_elements.append(element["textRun"].get("content", ""))
return "".join(text_elements)
# Handle links
if "textStyle" in element and "link" in element["textStyle"]:
text_elements.append(f"({element['textStyle']['link'].get('url', '')})")
if "person" in element:
name = element["person"].get("personProperties", {}).get("name", "")
email = element["person"].get("personProperties", {}).get("email", "")
person_str = "<Person|"
if name:
person_str += f"name: {name}, "
if email:
person_str += f"email: {email}"
person_str += ">"
text_elements.append(person_str)
if "richLink" in element:
props = element["richLink"].get("richLinkProperties", {})
title = props.get("title", "")
uri = props.get("uri", "")
link_str = f"[{title}]({uri})"
text_elements.append(link_str)
ret = "".join(text_elements)
# add chip strings in place of each non-text
text_chunks = ret.split(DRIVE_CHIP_CHAR)
num_non_text_elements = len(text_chunks) - 1
for i in range(num_non_text_elements):
text_chunks[i] += extract_chip(i) or UNKNOWN_SMART_CHIP_STR
return "".join(text_chunks), num_non_text_elements
def _extract_smart_chips_from_document(
    document_id: str,
    scripts_service: GoogleScriptsService,
    deployment_id: str,
) -> dict[str, str]:
    """Run the ``docToChips`` Apps Script function on a Google Doc.

    Returns a dictionary mapping smart chip location keys to smart chip text,
    or an empty dict when the execution result carries no ``response.result``.

    An Apps Script function is used because most smart chips are not currently
    available through the API: https://issuetracker.google.com/issues/225584757

    Each location key is formatted as "tabNum_paragraphNum_nonTextIndexNum".
    nonTextIndexNum is the index at which the value was found while traversing
    the paragraph or table cell from left to right, top to bottom.

    Many non-text elements are not supported by Apps Script (see
    https://developers.google.com/apps-script/reference/document/element-type ),
    so some non-text elements will have no entry.
    """
    run_body = {
        "function": "docToChips",
        "parameters": [document_id],
        # "devMode": True
    }
    # NOTE: the documentation is incorrect; the value passed as scriptId must
    # actually be the deployment id (shown under Deploy -> Test Deployments)
    execution = (
        scripts_service.scripts().run(scriptId=deployment_id, body=run_body).execute()
    )
    response = execution.get("response", {})
    return response.get("result", {})
def _extract_text_from_table(
table: dict[str, Any], extract_chip: Callable[[int], str | None]
) -> str:
"""
Extracts the text content from a table element.
Smart chip extraction will be wrong for nested tables.
"""
row_strs = []
seen_non_text = 0
def table_extract_chip(non_text_index: int) -> str | None:
return extract_chip(non_text_index + seen_non_text)
for row in table.get("tableRows", []):
cells = row.get("tableCells", [])
cell_strs = []
for cell in cells:
child_elements = cell.get("content", {})
cell_str = []
for child_elem in child_elements:
if "paragraph" not in child_elem:
continue
text, num_non_text_elements = _extract_text_from_paragraph(
child_elem["paragraph"], table_extract_chip
)
cell_str.append(text)
seen_non_text += num_non_text_elements
cell_strs.append("".join(cell_str))
row_strs.append(", ".join(cell_strs))
return "\n".join(row_strs)
def get_document_sections(
docs_service: GoogleDocsService,
scripts_service: GoogleScriptsService,
smart_chips_deployment_id: str,
doc_id: str,
) -> list[TextSection]:
"""Extracts sections from a Google Doc, including their headings and content"""
# Fetch the document structure
doc = docs_service.documents().get(documentId=doc_id).execute()
http_request = docs_service.documents().get(documentId=doc_id)
# Google has poor support for tabs in the docs api, see
# https://cloud.google.com/python/docs/reference/cloudtasks/
# latest/google.cloud.tasks_v2.types.HttpRequest
# https://developers.google.com/workspace/docs/api/how-tos/tabs
# https://developers.google.com/workspace/docs/api/reference/rest/v1/documents/get
# this is a hack to use the param mentioned in the rest api docs
# TODO: check if it can be specified i.e. in documents()
http_request.uri += "&includeTabsContent=true"
doc = http_request.execute()
smart_chips = {}
if USE_SMART_CHIP_SCOPES:
# Get the smart chips
smart_chips = _extract_smart_chips_from_document(
doc_id, scripts_service, smart_chips_deployment_id
)
# Get the content
content = doc.get("body", {}).get("content", [])
tabs = doc.get("tabs", {})
sections: list[TextSection] = []
for tab_num, tab in enumerate(tabs):
sections.extend(get_tab_sections(tab, doc_id, tab_num, smart_chips))
return sections
def _is_heading(paragraph: dict[str, Any]) -> bool:
"""Checks if a paragraph (a block of text in a drive document) is a heading"""
if not (
"paragraphStyle" in paragraph
and "namedStyleType" in paragraph["paragraphStyle"]
):
return False
style = paragraph["paragraphStyle"]["namedStyleType"]
is_heading = style.startswith("HEADING_")
is_title = style.startswith("TITLE")
return is_heading or is_title
def _add_finished_section(
    sections: list[TextSection],
    doc_id: str,
    tab_id: str,
    current_heading: CurrentHeading,
    current_section: list[str],
) -> None:
    """Append the section built so far to ``sections``, in place.

    Nothing is appended when the section has neither body lines nor heading
    text. Otherwise the section text is the heading text followed by the body
    lines (one per line, surrounding whitespace stripped), and the link points
    at the heading inside the given tab of the document.
    """
    has_content = bool(current_section) or bool(current_heading.text)
    if not has_content:
        return

    body = "\n".join(current_section)
    finished = TextSection(
        text=f"{current_heading.text}\n{body}".strip(),
        link=_build_gdoc_section_link(doc_id, tab_id, current_heading.id),
    )
    sections.append(finished)
def get_tab_sections(
tab: dict[str, Any], doc_id: str, tab_num: int, smart_chips: dict[str, str]
) -> list[TextSection]:
tab_id = tab["tabProperties"]["tabId"]
content = tab.get("documentTab", {}).get("body", {}).get("content", [])
sections: list[TextSection] = []
current_section: list[str] = []
current_heading: CurrentHeading | None = None
current_heading = CurrentHeading(id=None, text="")
for element in content:
if "paragraph" not in element:
continue
for element_num, element in enumerate(content):
paragraph = element["paragraph"]
def extract_chip(non_text_index: int) -> str | None:
return smart_chips.get(f"{tab_num}_{element_num-1}_{non_text_index}")
# Check if this is a heading
if (
"paragraphStyle" in paragraph
and "namedStyleType" in paragraph["paragraphStyle"]
):
style = paragraph["paragraphStyle"]["namedStyleType"]
is_heading = style.startswith("HEADING_")
is_title = style.startswith("TITLE")
if "paragraph" in element:
paragraph = element["paragraph"]
if is_heading or is_title:
# If we were building a previous section, add it to sections list
if current_heading is not None and current_section:
heading_text = current_heading.text
section_text = f"{heading_text}\n" + "\n".join(current_section)
sections.append(
TextSection(
text=section_text.strip(),
link=_build_gdoc_section_link(doc_id, current_heading.id),
)
)
current_section = []
# Start new heading
heading_id = _extract_id_from_heading(paragraph)
heading_text = _extract_text_from_paragraph(paragraph)
current_heading = CurrentHeading(
id=heading_id,
text=heading_text,
)
# If this is not a heading, add content to current section
if not _is_heading(paragraph):
text, _ = _extract_text_from_paragraph(paragraph, extract_chip)
if text.strip():
current_section.append(text)
continue
# Add content to current section
if current_heading is not None:
text = _extract_text_from_paragraph(paragraph)
_add_finished_section(
sections, doc_id, tab_id, current_heading, current_section
)
current_section = []
# Start new heading
heading_id = _extract_id_from_heading(paragraph)
heading_text, _ = _extract_text_from_paragraph(paragraph, extract_chip)
current_heading = CurrentHeading(
id=heading_id,
text=heading_text,
)
elif "table" in element:
text = _extract_text_from_table(element["table"], extract_chip)
if text.strip():
current_section.append(text)
# Don't forget to add the last section
if current_heading is not None and current_section:
section_text = f"{current_heading.text}\n" + "\n".join(current_section)
sections.append(
TextSection(
text=section_text.strip(),
link=_build_gdoc_section_link(doc_id, current_heading.id),
)
)
_add_finished_section(sections, doc_id, tab_id, current_heading, current_section)
return sections

View File

@@ -0,0 +1,132 @@
/**
 * Opens the given Google doc by id and collects the replacement text of its
 * smart chips into a plain object keyed by tab, body child and
 * non-text-component index (see getKey).
 *
 * Which chips are reported is decided by parseParagraph / parseTable; body
 * children that are not paragraphs, list items or tables are only logged.
 *
 * @param {string} document_id id of the Google doc to open
 * @return {Object<string, string>} location key -> replacement text
 */
function docToChips(document_id) {
  const doc = DocumentApp.openById(document_id);
  // Plain object rather than a Map: entries are written as properties, which
  // would leave a Map itself empty.
  const ret = {};
  doc.getTabs().forEach((tab, tabInd) => {
    const body = tab.asDocumentTab().getBody();
    for (let tabChildInd = 0; tabChildInd < body.getNumChildren(); tabChildInd++) {
      const tabChild = body.getChild(tabChildInd);
      const callback = (nonTextInd, replaceText) => {
        ret[getKey(tabInd, tabChildInd, nonTextInd)] = replaceText;
      };
      switch (tabChild.getType()) {
        case DocumentApp.ElementType.PARAGRAPH:
          parseParagraph(tabChild.asParagraph(), callback);
          console.log("paragraph", tabChild.asParagraph().getText());
          break;
        case DocumentApp.ElementType.TABLE:
          console.log("table");
          parseTable(tabChild.asTable(), callback);
          break;
        case DocumentApp.ElementType.LIST_ITEM:
          // list items are walked exactly like paragraphs
          parseParagraph(tabChild.asListItem(), callback);
          break;
        default:
          console.log("found unknown tab body child of type: ", tabChild.getType().toString());
      }
    }
  });
  console.log(ret);
  return ret;
}
// uncomment and paste in a file id (and change the main function to "test")
// to test the docToChips function
// function test() {
// return docToChips("document id goes here");
// }
/**
 * Builds the location key "tabInd_paragraphInd_nonTextInd" under which a smart
 * chip's text is stored.
 */
function getKey(tabInd, paragraphInd, nonTextInd) {
  return `${tabInd}_${paragraphInd}_${nonTextInd}`;
}
/**
 * Walks the children of a paragraph (also used for list items) and reports the
 * text of each supported smart chip through callback(nonTextInd, replaceText).
 *
 * nonTextInd is the position of the chip among the children that take an
 * index. TEXT, PERSON and RICH_LINK children take no index; every other child
 * does, whether or not text could be extracted for it.
 *
 * @return {number} how many indices this paragraph used, so callers walking
 *     several paragraphs (see parseTable) can keep a running offset
 */
function parseParagraph(paragraph, callback) {
  let nonTextInd = 0;
  for (let i = 0; i < paragraph.getNumChildren(); i++) {
    const child = paragraph.getChild(i);
    switch (child.getType()) {
      case DocumentApp.ElementType.DATE: {
        const dateStr = child.asDate().getDisplayText();
        console.log(dateStr);
        callback(nonTextInd, dateStr);
        nonTextInd++;
        break;
      }
      case DocumentApp.ElementType.EQUATION: {
        const eqStr = child.getText();
        console.log("equation: ", eqStr);
        callback(nonTextInd, eqStr);
        nonTextInd++;
        break;
      }
      case DocumentApp.ElementType.PERSON: {
        // Advanced Docs API picks up people: log only, no index taken
        const person = child.asPerson();
        console.log("<name: " + person.getName() + ", email: " + person.getEmail() + ">");
        break;
      }
      case DocumentApp.ElementType.RICH_LINK: {
        // Advanced Docs API picks up rich links: log only, no index taken
        const richLink = child.asRichLink();
        console.log("<title: " + richLink.getTitle() + ", type:" + richLink.getMimeType() + ">");
        break;
      }
      case DocumentApp.ElementType.TEXT:
        console.log("text: " + child.asText().getText());
        break;
      case DocumentApp.ElementType.UNSUPPORTED:
        // takes an index even though no text can be reported for it
        console.log("unsupported element type");
        nonTextInd++;
        break;
      default:
        console.log("found special element type:", child.getType().toString());
        nonTextInd++;
    }
  }
  return nonTextInd;
}

/**
 * Walks every cell of a table and reports smart chips through callback, using
 * indices that keep counting across all paragraphs of the table (row by row,
 * cell by cell).
 *
 * Each paragraph's indices are shifted by the number of indices used by the
 * paragraphs before it, so several chips in one paragraph get consecutive
 * indices.
 *
 * NOTE(review): nested tables also advance the running offset here; confirm
 * that this matches how the consumer of these keys counts nested tables.
 *
 * @return {number} how many indices the whole table used
 */
function parseTable(table, callback) {
  // indices used by the paragraphs / nested tables visited so far
  let seenBefore = 0;
  const offsetCallback = (nonTextInd, replaceText) => {
    callback(seenBefore + nonTextInd, replaceText);
  };

  for (let rowInd = 0; rowInd < table.getNumChildren(); rowInd++) {
    const row = table.getChild(rowInd);
    if (row.getType() !== DocumentApp.ElementType.TABLE_ROW) {
      console.log("table child type: ", row.getType().toString());
      continue;
    }
    for (let colInd = 0; colInd < row.getNumChildren(); colInd++) {
      const cell = row.getChild(colInd);
      if (cell.getType() !== DocumentApp.ElementType.TABLE_CELL) {
        console.log("row child type: ", cell.getType().toString());
        continue;
      }
      for (let itemInd = 0; itemInd < cell.getNumChildren(); itemInd++) {
        const item = cell.getChild(itemInd);
        console.log(item.getType().toString());
        switch (item.getType()) {
          case DocumentApp.ElementType.PARAGRAPH:
          case DocumentApp.ElementType.LIST_ITEM:
            seenBefore += parseParagraph(item, offsetCallback);
            break;
          case DocumentApp.ElementType.TABLE:
            seenBefore += parseTable(item, offsetCallback);
            break;
        }
      }
    }
  }
  return seenBefore;
}

View File

@@ -1,3 +1,4 @@
import json
import re
import time
from collections.abc import Callable
@@ -141,3 +142,50 @@ def execute_paginated_retrieval(
yield item
else:
yield results
# https://developers.google.com/apps-script/api/reference/rest/v1/File#FileType
class AppsScriptFileType(str, Enum):
    """Values for the ``type`` field of a file in an Apps Script project.

    Members are also ``str`` instances, so they compare equal to the raw
    string values.
    """

    UNSPECIFIED = "ENUM_TYPE_UNSPECIFIED"
    SERVER_JS = "SERVER_JS"
    HTML = "HTML"
    JSON = "JSON"
# (function name, parameter names) pairs that create_scripts_file_objects sends
# as the "functionSet" of the uploaded script file.
SMART_CHIP_RETRIEVAL_FUNCTIONS = [
    ("docToChips", ["document_id"]),
    ("getKey", ["tabInd", "paragraphInd", "nonTextInd"]),
    ("parseParagraph", ["paragraph", "callback"]),
    ("parseTable", ["table", "callback"]),
]
# Name given to the uploaded script file.
SMART_CHIP_SCRIPT_FILE_NAME = "Smart_Chip_Extractor"
# https://developers.google.com/apps-script/api/reference/rest/v1/projects/updateContent
def create_scripts_file_objects() -> list[GoogleDriveFileType]:
    """Build the ``files`` payload for an Apps Script ``updateContent`` call.

    Reads the bundled ``smart_chip_retrieval.gs`` source and ``appsscript.json``
    manifest and returns two file objects: the manifest (re-serialised JSON)
    and the server-side script together with its function set.

    Raises:
        FileNotFoundError: if either bundled file cannot be found.
        json.JSONDecodeError: if the manifest is not valid JSON.
    """
    from pathlib import Path

    def locate(file_name: str) -> Path:
        # Prefer the copy in the sibling google_drive package so the lookup
        # does not depend on the process working directory; otherwise fall back
        # to the original cwd-relative location.
        # NOTE(review): assumes this module lives one directory below the
        # package that contains google_drive/ - confirm.
        packaged = Path(__file__).resolve().parents[1] / "google_drive" / file_name
        if packaged.is_file():
            return packaged
        return Path("onyx/connectors/google_drive") / file_name

    script_source = locate("smart_chip_retrieval.gs").read_text(encoding="utf-8")
    with locate("appsscript.json").open(encoding="utf-8") as f:
        # parsed and dumped again below, which also validates the manifest
        appsscript_source = json.load(f)

    function_set = [
        {"name": name, "parameters": params}
        for name, params in SMART_CHIP_RETRIEVAL_FUNCTIONS
    ]
    return [
        {
            "name": "appsscript",
            "type": AppsScriptFileType.JSON.value,
            "source": json.dumps(appsscript_source),
        },
        {
            "name": SMART_CHIP_SCRIPT_FILE_NAME,
            "type": AppsScriptFileType.SERVER_JS.value,
            "source": script_source,
            "functionSet": {"values": function_set},
        },
    ]

View File

@@ -12,6 +12,10 @@ class GoogleDocsService(Resource):
pass
class GoogleScriptsService(Resource):
    """Marker subclass of ``Resource`` used to type the scripts service client."""
class AdminService(Resource):
pass
@@ -62,3 +66,10 @@ def get_gmail_service(
user_email: str | None = None,
) -> GmailService:
return _get_google_service("gmail", "v1", creds, user_email)
def get_google_scripts_service(
    creds: ServiceAccountCredentials | OAuthCredentials,
    user_email: str | None = None,
) -> GoogleScriptsService:
    """Return the service client for v1 of the "script" API.

    ``creds`` and ``user_email`` are passed through unchanged to
    ``_get_google_service``.
    """
    api_name, api_version = "script", "v1"
    return _get_google_service(api_name, api_version, creds, user_email)

View File

@@ -1,5 +1,6 @@
from enum import Enum as PyEnum
from onyx.configs.app_configs import USE_SMART_CHIP_SCOPES
from onyx.configs.constants import DocumentSource
# NOTE: do not need https://www.googleapis.com/auth/documents.readonly
@@ -18,6 +19,19 @@ GOOGLE_SCOPES = {
],
}
# TODO: add this to the docs
# Extra OAuth scopes added to the Google Drive scopes only when
# USE_SMART_CHIP_SCOPES is enabled.
GOOGLE_SMART_CHIP_SCOPES = [
    "https://www.googleapis.com/auth/script.external_request",
    "https://www.googleapis.com/auth/drive.scripts",
    "https://www.googleapis.com/auth/script.scriptapp",
    "https://www.googleapis.com/auth/script.deployments",
    "https://www.googleapis.com/auth/script.projects",
    "https://www.googleapis.com/auth/documents",
]

if USE_SMART_CHIP_SCOPES:
    # extends the existing Drive scope list in place
    GOOGLE_SCOPES[DocumentSource.GOOGLE_DRIVE].extend(GOOGLE_SMART_CHIP_SCOPES)
# This is the Oauth token
DB_CREDENTIALS_DICT_TOKEN_KEY = "google_tokens"
# This is the service account key