Compare commits

...

2 Commits

Author SHA1 Message Date
Evan Lohn
64f61a5c36 added todo 2025-07-24 14:18:10 -07:00
Evan Lohn
067f848390 fix: drive external links 2025-07-24 14:14:46 -07:00

View File

@@ -341,8 +341,21 @@ def docx_to_text_and_images(
for rel_id, rel in doc.part.rels.items():
if "image" in rel.reltype:
# image is typically in rel.target_part.blob
image_bytes = rel.target_part.blob
# TODO: we should attempt to get the image from the external source if possible.
# We'll sometimes have to give up if the image is on, e.g., some private network
# the container can't access, but best effort would be nice.
# Skip images that are linked rather than embedded (TargetMode="External")
if getattr(rel, "is_external", False):
continue
try:
# image is typically in rel.target_part.blob
image_bytes = rel.target_part.blob
except ValueError:
# Safeguard against relationships that lack an internal target_part
# (e.g., external relationships or other anomalies)
continue
image_name = rel.target_part.partname
# store
embedded_images.append((image_bytes, os.path.basename(str(image_name))))