@@ -644,28 +644,48 @@ def _save_outlook_message(message, folder_name_source: str) -> str:
644644 """Helper to save an Outlook message and its attachments."""
645645 nonlocal processed_count
646646 try :
647- # Generate unique ID based on subject and body
648- subj = getattr (message , 'Subject' , '' ) or ''
647+ # Filter for emails specifically if possible, but handle others
648+ # olMail=43, olAppointment=26, olMeetingRequest=53
649+ item_class = getattr (message , 'Class' , 43 )
650+
651+ subj = getattr (message , 'Subject' , 'No Subject' ) or 'No Subject'
649652 body = getattr (message , 'Body' , '' ) or ''
650- unique_id = hashlib . sha256 (( subj + body ). encode ( 'utf-8 ' , errors = 'ignore' )). hexdigest ()
653+ html_body = getattr ( message , 'HTMLBody ' , '' ) or ''
651654
652- sender_name = getattr (message , 'SenderName' , 'Unknown' )
653- folder_name = f"{ sanitize_filename (sender_name )} _{ sanitize_filename (subj )} _{ unique_id [:8 ]} "
655+ # Use EntryID if available for unique naming, fallback to hash
656+ entry_id = getattr (message , 'EntryID' , None )
657+ if entry_id :
658+ unique_id = hashlib .sha256 (entry_id .encode ('utf-8' )).hexdigest ()
659+ else :
660+ unique_id = hashlib .sha256 ((subj + body [:100 ]).encode ('utf-8' , errors = 'ignore' )).hexdigest ()
654661
662+ sender_name = "Unknown"
663+ try :
664+ if item_class == 43 : # MailItem
665+ sender_name = getattr (message , 'SenderName' , '' ) or getattr (message , 'SenderEmailAddress' , 'Unknown' )
666+ elif item_class == 26 : # AppointmentItem
667+ sender_name = getattr (message , 'Organizer' , 'Unknown' )
668+ except : pass
669+
670+ folder_name = f"{ sanitize_filename (sender_name )} _{ sanitize_filename (subj )} _{ unique_id [:8 ]} "
655671 save_folder = os .path .join (sandbox_path , "mail" , folder_name )
672+
656673 if os .path .exists (save_folder ):
657674 return None
658675
659676 os .makedirs (save_folder , exist_ok = True )
660677
661678 meta = {
662679 "id" : unique_id ,
680+ "EntryID" : entry_id ,
663681 "Subject" : subj ,
664682 "Body" : body ,
683+ "HTMLBody" : html_body [:5000 ] if html_body else "" , # Truncate HTML body for meta
665684 "ReceivedTime" : str (getattr (message , 'ReceivedTime' , '' )),
666685 "Sender" : sender_name ,
667686 "To" : getattr (message , 'To' , '' ),
668- "FolderName" : folder_name_source
687+ "FolderName" : folder_name_source ,
688+ "ItemClass" : item_class
669689 }
670690
671691 with open (os .path .join (save_folder , "email_data.json" ), 'w' , encoding = 'utf-8' ) as f :
@@ -707,90 +727,96 @@ def _save_outlook_message(message, folder_name_source: str) -> str:
707727 conn = win32com .client .Dispatch ("ADODB.Connection" )
708728 conn .Open ("Provider=Search.CollatorDSO;Extended Properties='Application=Windows';" )
709729
710- # SQL-like query for Windows Search
730+ # We fetch System.ItemUrl which contains the outlook:0000... protocol with EntryID
711731 sql = f"""
712- SELECT System.ItemUrl FROM SystemIndex
732+ SELECT " System.ItemUrl" FROM SystemIndex
713733 WHERE System.Kind = 'email'
714734 AND CONTAINS(*, '"{ query } "')
715735 """
716736 if received_after :
717- sql += f" AND System.DateModified >= '{ received_after } '"
737+ # Windows Search date format is YYYY/MM/DD
738+ sql += f" AND System.DateModified >= '{ received_after .replace ('-' , '/' )} '"
718739 sql += " ORDER BY System.DateModified DESC"
719740
720741 rs = conn .Execute (sql )[0 ]
721742 while not rs .EOF :
722- item_url = rs .Fields .Item (0 ).Value
743+ if processed_count >= 50 : break
744+ item_url = rs .Fields .Item (0 ).Value # e.g. outlook:00000000...
723745 try :
724- # GetItemFromID is better if we had EntryID, but ItemUrl works with some namespaces
725- # Or we try to bond back via EntryID if we selected it.
726- # For simplicity in this tool, we use the URL to find the item or fallback.
727- pass # ADODB is great for finding IF we can map back to Outlook Object
746+ # Extract entry ID from URL if it starts with outlook:
747+ if item_url .startswith ("outlook:" ):
748+ entry_id = item_url .split ("outlook:" )[1 ]
749+ message = namespace .GetItemFromID (entry_id )
750+ path = _save_outlook_message (message , "WindowsIndex" )
751+ if path : saved_paths .append (path )
728752 except : pass
729753 rs .MoveNext ()
730754 rs .Close ()
731755 conn .Close ()
732756 except Exception :
733- pass # Fallback to DASL
757+ pass
734758
735759 # 2. Strategy: DASL Filter (Outlook Native Search) - Medium Speed
736- # We search in Inbox (6) and Sent (5) by default for efficiency
737- targets = [namespace .GetDefaultFolder (6 ), namespace .GetDefaultFolder (5 )]
738-
739- dasl_query = ""
740- if query :
741- dasl_query = (
742- f"@SQL=\" urn:schemas:httpmail:subject\" LIKE '%{ query } %' OR "
743- f"\" urn:schemas:httpmail:textdescription\" LIKE '%{ query } %' OR "
744- f"\" urn:schemas:httpmail:fromname\" LIKE '%{ query } %'"
745- )
746-
747- # Add Date Filter to DASL
748- if received_after :
749- date_part = f"\" urn:schemas:httpmail:datereceived\" >= '{ received_after } 00:00 AM'"
750- if dasl_query :
751- dasl_query = f"@SQL=({ dasl_query .replace ('@SQL=' , '' )} ) AND { date_part } "
752- else :
753- dasl_query = f"@SQL={ date_part } "
760+ if processed_count < 10 : # Only if ADODB didn't yield much
761+ targets = [namespace .GetDefaultFolder (6 ), namespace .GetDefaultFolder (5 )]
762+
763+ dasl_query = ""
764+ if query :
765+ dasl_query = (
766+ f"@SQL=\" urn:schemas:httpmail:subject\" LIKE '%{ query } %' OR "
767+ f"\" urn:schemas:httpmail:textdescription\" LIKE '%{ query } %' OR "
768+ f"\" urn:schemas:httpmail:fromname\" LIKE '%{ query } %'"
769+ )
770+
771+ if received_after :
772+ try :
773+ dt_obj = datetime .strptime (received_after , "%Y-%m-%d" )
774+ date_part = f"\" urn:schemas:httpmail:datereceived\" >= '{ dt_obj .strftime ('%m/%d/%Y' )} 00:00 AM'"
775+ if dasl_query :
776+ dasl_query = f"@SQL=({ dasl_query .replace ('@SQL=' , '' )} ) AND { date_part } "
777+ else :
778+ dasl_query = f"@SQL={ date_part } "
779+ except : pass
754780
755- for folder in targets :
756- try :
757- items = folder .Items
758- if dasl_query :
759- items = items .Restrict (dasl_query )
760-
761- items .Sort ("[ReceivedTime]" , True )
762-
763- count = 0
764- for message in items :
765- if count >= 50 : break # Safety limit per folder
766- path = _save_outlook_message (message , folder .Name )
767- if path :
768- saved_paths .append (path )
769- count += 1
770- except Exception :
771- continue
781+ for folder in targets :
782+ if processed_count >= 50 : break
783+ try :
784+ items = folder .Items
785+ if dasl_query :
786+ items = items .Restrict (dasl_query )
787+ items .Sort ("[ReceivedTime]" , True )
788+
789+ for message in items :
790+ if processed_count >= 50 : break
791+ path = _save_outlook_message (message , folder .Name )
792+ if path : saved_paths .append (path )
793+ except Exception : continue
772794
773795 # 3. Strategy: Full Recursive Fallback (Current) - Slowest
774- # Only if we found nothing and no error occurred
775796 if processed_count == 0 :
776797 def process_folder_recursive (folder ):
777798 for sub in folder .Folders :
799+ if processed_count >= 50 : return
778800 process_folder_recursive (sub )
779801
780802 items = folder .Items
781- if outlook_date_filter :
782- try : items = items .Restrict (outlook_date_filter )
803+ # Use crude restriction for dates if possible
804+ if received_after :
805+ try :
806+ dt_obj = datetime .strptime (received_after , "%Y-%m-%d" )
807+ local_filter = f"[ReceivedTime] >= '{ dt_obj .strftime ('%m/%d/%Y' )} 00:00 AM'"
808+ items = items .Restrict (local_filter )
783809 except : pass
784810
785811 for msg in items :
812+ if processed_count >= 50 : return
786813 if query :
787814 subj = getattr (msg , 'Subject' , '' ) or ''
788815 body = getattr (msg , 'Body' , '' ) or ''
789816 if not re .search (query , f"{ subj } { body } " , re .I ): continue
790817
791818 path = _save_outlook_message (msg , folder .Name )
792819 if path : saved_paths .append (path )
793- if processed_count >= 50 : return
794820
795821 for account in namespace .Folders :
796822 if processed_count >= 50 : break
0 commit comments