#!/usr/bin/env python
import sys
import mwxml
import os
import re

def sanitize_filename(s):
    """Sanitize string to be a safe filename.

    Surrounding whitespace is removed, then every path separator,
    character reserved on common filesystems (colon, asterisk, question
    mark, double quote, angle brackets, pipe) and ASCII control character
    is replaced with an underscore. The result is limited to 100
    characters and never ends in whitespace.

    Args:
        s: The raw string (e.g. a page title or a user name).

    Returns:
        The sanitized string; may be empty if ``s`` is empty or blank.
    """
    # Strip first so leading/trailing newlines or tabs are dropped rather
    # than turned into underscores; interior control characters (including
    # NUL, which open() rejects) are replaced.
    cleaned = re.sub(r'[\/\\:*?"<>|\x00-\x1f\x7f]', '_', s.strip())
    # Truncation can cut right after an interior space, so trim again.
    return cleaned[:100].rstrip()

def save_to_txt(title, author, date, text):
    """Write ``text`` to ``<author>_<title>_<date>.txt`` in the working directory.

    The author and title are passed through ``sanitize_filename``; the date
    is inserted as given. An existing file with the same name is overwritten.
    """
    safe_author = sanitize_filename(author)
    safe_title = sanitize_filename(title)
    target = f"{safe_author}_{safe_title}_{date}.txt"
    with open(target, "w", encoding="utf-8") as out:
        out.write(text)

def main():
    """Export the most recent revision of every page in a MediaWiki XML dump.

    Expects exactly one command-line argument: the path to the XML dump.
    For each page that has at least one revision, the title, author, date
    and a snippet of up to 200 characters of text are printed, and the
    full text is written to a file via ``save_to_txt``. The number of pages
    seen (including pages without revisions) is printed at the end.

    Exits with status 1 after printing a usage line when the number of
    command-line arguments is wrong.
    """
    if len(sys.argv) != 2:
        print("Usage: python wikidump2txt.py <xml file>")
        sys.exit(1)

    xml_file = sys.argv[1]

    total_pages = 0
    # The dump is read from the file while iterating over its pages, so the
    # whole loop stays inside the ``with`` block; the handle is closed
    # afterwards even if processing fails part-way.
    with open(xml_file, 'rb') as dump_file:
        dump = mwxml.Dump.from_file(dump_file)

        for page in dump.pages:
            total_pages += 1

            # Get the last revision (assumed to be the most recent)
            last_revision = None
            for revision in page:
                last_revision = revision
            if last_revision is None:
                continue

            title = page.title
            text = last_revision.text or ""

            # The contributor can be missing (e.g. deleted or suppressed
            # in the dump), in which case ``user`` is None; fall back to a
            # placeholder instead of raising AttributeError.
            user = last_revision.user
            author = (user.text if user is not None else None) or "Unknown"

            timestamp = last_revision.timestamp
            if timestamp and hasattr(timestamp, "strftime"):
                date = timestamp.strftime("%Y-%m-%d")
            else:
                date = "UnknownDate"

            print("Title:", title)
            print("Author:", author)
            print("Date:", date)
            print("Text snippet:", text[:200] + ("..." if len(text) > 200 else ""))
            print("=" * 80)

            save_to_txt(title, author, date, text)

    print("Total pages:", total_pages)

# Run the converter only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
