#!/usr/bin/env python3
"""Text Statistics Script - Analyzes /etc/hostname"""

import json
import re
from collections import Counter

# Common English stopwords excluded from the word-frequency ranking.
# frozenset: immutable (can't be mutated by accident) with the same O(1)
# membership tests as a set.
STOPWORDS = frozenset({
    'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
    'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been',
    'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
    'should', 'may', 'might', 'must', 'shall', 'can', 'need', 'dare', 'ought',
    'used', 'it', 'its', 'this', 'that', 'these', 'those', 'i', 'you', 'he',
    'she', 'we', 'they', 'what', 'which', 'who', 'whom', 'whose', 'where',
    'when', 'why', 'how', 'all', 'each', 'every', 'both', 'few', 'more',
    'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own',
    'same', 'so', 'than', 'too', 'very', 'just', 'also'
})

def analyze_text(filepath):
    """Compute basic text statistics for the file at *filepath*.

    Args:
        filepath: Path to a UTF-8 text file.

    Returns:
        dict with keys:
            'file': the input path,
            'statistics': {'lines', 'words', 'characters'},
            'top_5_common_words': list of {'word', 'count'} dicts for the
                five most frequent words, excluding stopwords and
                single-character tokens,
            'average_word_length': mean length over ALL words (stopwords
                included), rounded to 2 decimals; 0 for an empty file.

    Raises:
        OSError: if the file cannot be read.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    # 1. Read the whole file. Explicit encoding: without it, open() falls
    # back to the platform/locale default, so results differ by machine.
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # 2. Count lines, words, characters.
    # splitlines() matches the old "newline count, plus one for a final
    # unterminated line" logic for '\n' files, and also handles '\r\n'
    # and '\r' conventions correctly.
    line_count = len(content.splitlines())
    # \b\w+\b: maximal runs of word characters; lowercased so frequency
    # counts are case-insensitive.
    words = re.findall(r'\b\w+\b', content.lower())

    # 3. Top 5 common words, excluding stopwords and 1-char tokens
    # (mostly stray letters and single digits).
    filtered = (w for w in words if len(w) > 1 and w not in STOPWORDS)
    top_5 = Counter(filtered).most_common(5)

    # 4. Average word length over all words; guard the empty-file case
    # to avoid ZeroDivisionError.
    avg_word_length = sum(map(len, words)) / len(words) if words else 0

    # 5. Assemble the JSON-serializable result.
    return {
        'file': filepath,
        'statistics': {
            'lines': line_count,
            'words': len(words),
            'characters': len(content),
        },
        'top_5_common_words': [{'word': w, 'count': c} for w, c in top_5],
        'average_word_length': round(avg_word_length, 2),
    }

if __name__ == '__main__':
    # Entry point: analyze the system hostname file and pretty-print the
    # resulting statistics as JSON on stdout.
    print(json.dumps(analyze_text('/etc/hostname'), indent=2))
