confluence_api/confluence_page_dates.py at main · sladehouseltd/confluence_api · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
#!/usr/bin/env python3

import argparse
import csv
import os
import sys
from datetime import datetime
from getpass import getpass
from typing import List, Dict, Optional

import requests
from requests.auth import HTTPBasicAuth


class ConfluencePageAnalyzer:
    def __init__(self, base_url: str, username: str, password: str):
        self.base_url = base_url.rstrip('/')
        self.auth = HTTPBasicAuth(username, password)
        self.session = requests.Session()
        self.session.auth = self.auth

    def get_all_pages_in_space(self, space_key: str) -> List[Dict]:
        """Retrieve all pages from a Confluence space."""
        pages = []
        start = 0
        limit = 50

        print("Fetching pages from Confluence...")
        while True:
            url = f"{self.base_url}/rest/api/content"
            params = {
                'spaceKey': space_key,
                'type': 'page',
                'start': start,
                'limit': limit,
                'expand': 'version,history.lastUpdated,space,_links'
            }

            response = self.session.get(url, params=params)
            response.raise_for_status()

            data = response.json()
            pages.extend(data['results'])
            print(f"Fetched {len(pages)} pages so far...")

            if len(data['results']) < limit:
                break

            start += limit

        return pages

    def get_page_analytics(self, page_id: str) -> Optional[Dict]:
        """Get analytics data for a specific page."""
        try:
            # Try the analytics endpoint first
            url = f"{self.base_url}/rest/api/analytics/content/{page_id}/views"
            response = self.session.get(url)

            if response.status_code == 200:
                data = response.json()
                if data:  # Check if data is not empty
                    return data

            # If analytics fails, try the audit log approach (Confluence Cloud)
            audit_url = f"{self.base_url}/rest/api/audit"
            audit_params = {
                'searchString': page_id,
                'limit': 1
            }
            audit_response = self.session.get(audit_url, params=audit_params)

            if audit_response.status_code == 200:
                audit_data = audit_response.json()
                if audit_data.get('results'):
                    return {'audit': audit_data['results'][0]}

            return None
        except Exception:
            return None

    def analyze_pages(self, space_key: str, include_modified: bool, include_viewed: bool) -> List[Dict]:
        """Analyze pages in a space for modification and view dates."""
        pages = self.get_all_pages_in_space(space_key)
        results = []
        total_pages = len(pages)
        print(f"Found {total_pages} pages to analyze...")

        for i, page in enumerate(pages, 1):
            # Construct page URL using web UI link if available, fallback to page ID
            if '_links' in page and 'webui' in page['_links']:
                page_url = f"{self.base_url}{page['_links']['webui']}"
            else:
                page_url = f"{self.base_url}/display/{page['space']['key']}/{page['id']}"

            page_data = {
                'page': page['title'],
                'page_url': page_url,
                'page_id': page['id']
            }

            if include_modified:
                # Get last modified date from version history
                if 'version' in page:
                    modified_date = page['version']['when']
                    page_data['date_modified'] = datetime.fromisoformat(
                        modified_date.replace('Z', '+00:00')
                    ).strftime('%Y-%m-%d %H:%M:%S')
                else:
                    page_data['date_modified'] = 'N/A'

            if include_viewed:
                # Get last viewed date from analytics
                analytics = self.get_page_analytics(page['id'])
                if analytics:
                    if 'views' in analytics and analytics['views']:
                        # Standard analytics response
                        latest_view = max(analytics['views'], key=lambda x: x['date'])
                        page_data['date_viewed'] = latest_view['date']
                    elif 'audit' in analytics:
                        # Audit log response
                        audit_entry = analytics['audit']
                        if 'creationDate' in audit_entry:
                            page_data['date_viewed'] = audit_entry['creationDate']
                        else:
                            page_data['date_viewed'] = 'N/A'
                    else:
                        # Try to use any date field available
                        for date_field in ['lastViewDate', 'lastAccessed', 'viewDate', 'date']:
                            if date_field in analytics:
                                page_data['date_viewed'] = analytics[date_field]
                                break
                        else:
                            page_data['date_viewed'] = 'N/A'
                else:
                    page_data['date_viewed'] = 'N/A'

            results.append(page_data)

            # Progress update every 50 pages
            if i % 50 == 0:
                print(f"Analyzed {i}/{total_pages} pages...")

        print(f"Analysis complete - processed {total_pages} pages")
        return results

    def write_csv(self, data: List[Dict], filename: str, include_modified: bool, include_viewed: bool):
        """Write results to CSV file."""
        if not data:
            print("No data to write.")
            return

        # Determine columns and sort key
        columns = ['page', 'page_url']
        sort_key = None

        if include_modified and include_viewed:
            columns.extend(['date_modified', 'date_viewed'])
            sort_key = 'date_modified'  # Default to modified date for sorting
        elif include_modified:
            columns.append('date_modified')
            sort_key = 'date_modified'
        elif include_viewed:
            columns.append('date_viewed')
            sort_key = 'date_viewed'

        # Sort data by date descending (handle N/A values)
        if sort_key:
            def sort_func(item):
                date_str = item.get(sort_key, 'N/A')
                if date_str == 'N/A':
                    return datetime.min
                try:
                    if 'T' in date_str:
                        return datetime.fromisoformat(date_str.replace('Z', '+00:00'))
                    else:
                        return datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')
                except ValueError:
                    return datetime.min

            data.sort(key=sort_func, reverse=True)

        # Write CSV
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=columns)
            writer.writeheader()

            for row in data:
                # Only write columns that are defined
                filtered_row = {col: row.get(col, '') for col in columns}
                writer.writerow(filtered_row)

        print(f"Results written to {filename}")


def load_config():
    """Load configuration from .env file or environment variables."""
    config = {}

    # Try to load from .env file
    env_file = '.env'
    if os.path.exists(env_file):
        with open(env_file, 'r') as f:
            for line in f:
                line = line.strip()
                if line and not line.startswith('#') and '=' in line:
                    key, value = line.split('=', 1)
                    config[key.strip()] = value.strip().strip('"\'')

    # Override with environment variables
    config['CONFLUENCE_URL'] = os.getenv('CONFLUENCE_URL', config.get('CONFLUENCE_URL', ''))
    config['CONFLUENCE_USERNAME'] = os.getenv('CONFLUENCE_USERNAME', config.get('CONFLUENCE_USERNAME', ''))
    config['CONFLUENCE_PASSWORD'] = os.getenv('CONFLUENCE_PASSWORD', config.get('CONFLUENCE_PASSWORD', ''))

    return config


def get_credentials():
    """Get Confluence credentials from config or user input."""
    config = load_config()

    base_url = config.get('CONFLUENCE_URL', '').strip()
    username = config.get('CONFLUENCE_USERNAME', '').strip()
    password = config.get('CONFLUENCE_PASSWORD', '').strip()

    if not base_url:
        base_url = input("Enter Confluence URL: ").strip()

    if not username:
        username = input("Enter username: ").strip()

    if not password:
        password = getpass("Enter password: ")

    return base_url, username, password


def main():
    parser = argparse.ArgumentParser(
        description='Analyze Confluence page modification and view dates'
    )
    parser.add_argument('space', help='Confluence space key')
    parser.add_argument('--date-modified', action='store_true',
                      help='Include last modified dates in output')
    parser.add_argument('--date-viewed', action='store_true',
                      help='Include last viewed dates in output')
    parser.add_argument('--output', '-o', default=None,
                      help='Output CSV filename (default: auto-generated)')

    args = parser.parse_args()

    if not args.date_modified and not args.date_viewed:
        print("Error: You must specify at least one of --date-modified or --date-viewed")
        sys.exit(1)

    try:
        base_url, username, password = get_credentials()
        analyzer = ConfluencePageAnalyzer(base_url, username, password)

        print(f"Analyzing pages in space: {args.space}")
        results = analyzer.analyze_pages(args.space, args.date_modified, args.date_viewed)

        if not results:
            print("No pages found in the specified space.")
            return

        # Generate filename if not provided
        if args.output is None:
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            filename = f"confluence_pages_{args.space}_{timestamp}.csv"
        else:
            filename = args.output

        analyzer.write_csv(results, filename, args.date_modified, args.date_viewed)
        print(f"Analysis complete. Found {len(results)} pages.")

    except requests.exceptions.RequestException as e:
        print(f"Error connecting to Confluence: {e}")
        sys.exit(1)
    except KeyboardInterrupt:
        print("\nOperation cancelled by user.")
        sys.exit(1)
    except Exception as e:
        print(f"An error occurred: {e}")
        sys.exit(1)


if __name__ == '__main__':
    main()