Skip to content

Commit 40b66fa

Browse files
author
Jonathan Sprauel
committed
trying windows search api for mail search
1 parent b536c45 commit 40b66fa

1 file changed

Lines changed: 80 additions & 54 deletions

File tree

tools.py

Lines changed: 80 additions & 54 deletions
Original file line number | Diff line number | Diff line change
@@ -644,28 +644,48 @@ def _save_outlook_message(message, folder_name_source: str) -> str:
644644
"""Helper to save an Outlook message and its attachments."""
645645
nonlocal processed_count
646646
try:
647-
# Generate unique ID based on subject and body
648-
subj = getattr(message, 'Subject', '') or ''
647+
# Filter for emails specifically if possible, but handle others
648+
# olMail=43, olAppointment=26, olMeetingRequest=53
649+
item_class = getattr(message, 'Class', 43)
650+
651+
subj = getattr(message, 'Subject', 'No Subject') or 'No Subject'
649652
body = getattr(message, 'Body', '') or ''
650-
unique_id = hashlib.sha256((subj + body).encode('utf-8', errors='ignore')).hexdigest()
653+
html_body = getattr(message, 'HTMLBody', '') or ''
651654

652-
sender_name = getattr(message, 'SenderName', 'Unknown')
653-
folder_name = f"{sanitize_filename(sender_name)}_{sanitize_filename(subj)}_{unique_id[:8]}"
655+
# Use EntryID if available for unique naming, fallback to hash
656+
entry_id = getattr(message, 'EntryID', None)
657+
if entry_id:
658+
unique_id = hashlib.sha256(entry_id.encode('utf-8')).hexdigest()
659+
else:
660+
unique_id = hashlib.sha256((subj + body[:100]).encode('utf-8', errors='ignore')).hexdigest()
654661

662+
sender_name = "Unknown"
663+
try:
664+
if item_class == 43: # MailItem
665+
sender_name = getattr(message, 'SenderName', '') or getattr(message, 'SenderEmailAddress', 'Unknown')
666+
elif item_class == 26: # AppointmentItem
667+
sender_name = getattr(message, 'Organizer', 'Unknown')
668+
except: pass
669+
670+
folder_name = f"{sanitize_filename(sender_name)}_{sanitize_filename(subj)}_{unique_id[:8]}"
655671
save_folder = os.path.join(sandbox_path, "mail", folder_name)
672+
656673
if os.path.exists(save_folder):
657674
return None
658675

659676
os.makedirs(save_folder, exist_ok=True)
660677

661678
meta = {
662679
"id": unique_id,
680+
"EntryID": entry_id,
663681
"Subject": subj,
664682
"Body": body,
683+
"HTMLBody": html_body[:5000] if html_body else "", # Truncate HTML body for meta
665684
"ReceivedTime": str(getattr(message, 'ReceivedTime', '')),
666685
"Sender": sender_name,
667686
"To": getattr(message, 'To', ''),
668-
"FolderName": folder_name_source
687+
"FolderName": folder_name_source,
688+
"ItemClass": item_class
669689
}
670690

671691
with open(os.path.join(save_folder, "email_data.json"), 'w', encoding='utf-8') as f:
@@ -707,90 +727,96 @@ def _save_outlook_message(message, folder_name_source: str) -> str:
707727
conn = win32com.client.Dispatch("ADODB.Connection")
708728
conn.Open("Provider=Search.CollatorDSO;Extended Properties='Application=Windows';")
709729

710-
# SQL-like query for Windows Search
730+
# We fetch System.ItemUrl which contains the outlook:0000... protocol with EntryID
711731
sql = f"""
712-
SELECT System.ItemUrl FROM SystemIndex
732+
SELECT "System.ItemUrl" FROM SystemIndex
713733
WHERE System.Kind = 'email'
714734
AND CONTAINS(*, '"{query}"')
715735
"""
716736
if received_after:
717-
sql += f" AND System.DateModified >= '{received_after}'"
737+
# Windows Search date format is YYYY/MM/DD
738+
sql += f" AND System.DateModified >= '{received_after.replace('-', '/')}'"
718739
sql += " ORDER BY System.DateModified DESC"
719740

720741
rs = conn.Execute(sql)[0]
721742
while not rs.EOF:
722-
item_url = rs.Fields.Item(0).Value
743+
if processed_count >= 50: break
744+
item_url = rs.Fields.Item(0).Value # e.g. outlook:00000000...
723745
try:
724-
# GetItemFromID is better if we had EntryID, but ItemUrl works with some namespaces
725-
# Or we try to bond back via EntryID if we selected it.
726-
# For simplicity in this tool, we use the URL to find the item or fallback.
727-
pass # ADODB is great for finding IF we can map back to Outlook Object
746+
# Extract entry ID from URL if it starts with outlook:
747+
if item_url.startswith("outlook:"):
748+
entry_id = item_url.split("outlook:")[1]
749+
message = namespace.GetItemFromID(entry_id)
750+
path = _save_outlook_message(message, "WindowsIndex")
751+
if path: saved_paths.append(path)
728752
except: pass
729753
rs.MoveNext()
730754
rs.Close()
731755
conn.Close()
732756
except Exception:
733-
pass # Fallback to DASL
757+
pass
734758

735759
# 2. Strategy: DASL Filter (Outlook Native Search) - Medium Speed
736-
# We search in Inbox (6) and Sent (5) by default for efficiency
737-
targets = [namespace.GetDefaultFolder(6), namespace.GetDefaultFolder(5)]
738-
739-
dasl_query = ""
740-
if query:
741-
dasl_query = (
742-
f"@SQL=\"urn:schemas:httpmail:subject\" LIKE '%{query}%' OR "
743-
f"\"urn:schemas:httpmail:textdescription\" LIKE '%{query}%' OR "
744-
f"\"urn:schemas:httpmail:fromname\" LIKE '%{query}%'"
745-
)
746-
747-
# Add Date Filter to DASL
748-
if received_after:
749-
date_part = f"\"urn:schemas:httpmail:datereceived\" >= '{received_after} 00:00 AM'"
750-
if dasl_query:
751-
dasl_query = f"@SQL=({dasl_query.replace('@SQL=', '')}) AND {date_part}"
752-
else:
753-
dasl_query = f"@SQL={date_part}"
760+
if processed_count < 10: # Only if ADODB didn't yield much
761+
targets = [namespace.GetDefaultFolder(6), namespace.GetDefaultFolder(5)]
762+
763+
dasl_query = ""
764+
if query:
765+
dasl_query = (
766+
f"@SQL=\"urn:schemas:httpmail:subject\" LIKE '%{query}%' OR "
767+
f"\"urn:schemas:httpmail:textdescription\" LIKE '%{query}%' OR "
768+
f"\"urn:schemas:httpmail:fromname\" LIKE '%{query}%'"
769+
)
770+
771+
if received_after:
772+
try:
773+
dt_obj = datetime.strptime(received_after, "%Y-%m-%d")
774+
date_part = f"\"urn:schemas:httpmail:datereceived\" >= '{dt_obj.strftime('%m/%d/%Y')} 00:00 AM'"
775+
if dasl_query:
776+
dasl_query = f"@SQL=({dasl_query.replace('@SQL=', '')}) AND {date_part}"
777+
else:
778+
dasl_query = f"@SQL={date_part}"
779+
except: pass
754780

755-
for folder in targets:
756-
try:
757-
items = folder.Items
758-
if dasl_query:
759-
items = items.Restrict(dasl_query)
760-
761-
items.Sort("[ReceivedTime]", True)
762-
763-
count = 0
764-
for message in items:
765-
if count >= 50: break # Safety limit per folder
766-
path = _save_outlook_message(message, folder.Name)
767-
if path:
768-
saved_paths.append(path)
769-
count += 1
770-
except Exception:
771-
continue
781+
for folder in targets:
782+
if processed_count >= 50: break
783+
try:
784+
items = folder.Items
785+
if dasl_query:
786+
items = items.Restrict(dasl_query)
787+
items.Sort("[ReceivedTime]", True)
788+
789+
for message in items:
790+
if processed_count >= 50: break
791+
path = _save_outlook_message(message, folder.Name)
792+
if path: saved_paths.append(path)
793+
except Exception: continue
772794

773795
# 3. Strategy: Full Recursive Fallback (Current) - Slowest
774-
# Only if we found nothing and no error occurred
775796
if processed_count == 0:
776797
def process_folder_recursive(folder):
777798
for sub in folder.Folders:
799+
if processed_count >= 50: return
778800
process_folder_recursive(sub)
779801

780802
items = folder.Items
781-
if outlook_date_filter:
782-
try: items = items.Restrict(outlook_date_filter)
803+
# Use crude restriction for dates if possible
804+
if received_after:
805+
try:
806+
dt_obj = datetime.strptime(received_after, "%Y-%m-%d")
807+
local_filter = f"[ReceivedTime] >= '{dt_obj.strftime('%m/%d/%Y')} 00:00 AM'"
808+
items = items.Restrict(local_filter)
783809
except: pass
784810

785811
for msg in items:
812+
if processed_count >= 50: return
786813
if query:
787814
subj = getattr(msg, 'Subject', '') or ''
788815
body = getattr(msg, 'Body', '') or ''
789816
if not re.search(query, f"{subj} {body}", re.I): continue
790817

791818
path = _save_outlook_message(msg, folder.Name)
792819
if path: saved_paths.append(path)
793-
if processed_count >= 50: return
794820

795821
for account in namespace.Folders:
796822
if processed_count >= 50: break

0 commit comments

Comments
 (0)