import re
import json
from typing import List, Optional

# Leading list index such as "1:" or "1." in front of a citation.
_INDEX_PREFIX = re.compile(r"^\d+[:\.]\s*")

# Citation layouts, tried in order; the first one that matches wins.
# Two layouts suffice: a separate "APA-like" or minimal fallback pattern would
# share these prefixes and, because every field after the year is optional,
# could never be reached.
_CITATION_PATTERNS = (
    # Authors. Title. Journal. Year [Month [Day]];Volume(Issue):Pages
    re.compile(
        r"(?P<authors>.+?)\.\s(?P<title>.+?)\.\s(?P<journal>.+?)\.\s*"
        r"(?P<year>\d{4})(?:\s(?P<month>[A-Za-z]+)(?:\s(?P<day>\d+))?)?\s*[,;:]?\s*"
        r"(?:(?P<volume>\d+)(?:\((?P<issue>\d+)\))?)?\s*[-:;]?\s*(?P<pages>\d+(?:[-–]\d+)?)?\s*\.?\s*",
        re.IGNORECASE
    ),
    # Title-first style: Title. Journal. Year;Volume(Issue):Pages (no authors)
    re.compile(
        r"(?P<title>.+?)\.\s(?P<journal>.+?)\.\s*"
        r"(?P<year>\d{4})\s*[,;:]?\s*(?P<volume>\d+)?(?:\((?P<issue>\d+)\))?[,;:]?\s*"
        r"(?P<pages>\d+(?:[-–]\d+)?)?\s*\.?\s*",
        re.IGNORECASE
    ),
)

# DOI introduced by "doi:" (any case, optional space) or a doi.org URL.
_DOI_PATTERN = re.compile(
    r"(?:doi:\s*|https?://(?:dx\.)?doi\.org/)(10\.\d{4,9}/[-._;()/:A-Za-z0-9]+)",
    re.IGNORECASE
)


def parse_citation(citation: str) -> Optional[dict]:
    """Split one citation string into its bibliographic fields.

    The citation is flattened to a single line, surrounding whitespace and a
    leading list index ("1:" / "1.") are removed, and the text is matched
    against the known layouts in order.

    Args:
        citation: Raw citation text, possibly spanning several lines.

    Returns:
        A dict with the keys "Title", "Authors", "Published", "Journal",
        "Volume", "Issue", "Pages", "PubMed_Link" and "doi", where any field
        that was not found is the string "N/A"; or None when no layout
        matches the text.
    """
    # Strip before removing the index so that leading whitespace cannot hide
    # the index and leak it into the "Authors" field.
    citation = citation.replace("\n", " ").strip()
    citation = _INDEX_PREFIX.sub("", citation)

    # The DOI is looked up independently of the layout patterns.
    doi_match = _DOI_PATTERN.search(citation)
    if doi_match:
        # The DOI character class allows '.', ';' and ':', so punctuation that
        # merely ends the sentence gets captured too; drop it.
        doi = doi_match.group(1).rstrip(".,;:")
    else:
        doi = "N/A"

    for pattern in _CITATION_PATTERNS:
        match = pattern.search(citation)
        if not match:
            continue

        fields = match.groupdict()

        # "Published" is the year, optionally followed by month and day.
        published_parts = [fields["year"]]
        month = fields.get("month")
        if month:
            published_parts.append(month)
            day = fields.get("day")
            if day:
                published_parts.append(day)

        if doi != "N/A":
            pubmed_link = f"https://pubmed.ncbi.nlm.nih.gov/?term={doi}"
        else:
            pubmed_link = "N/A"

        return {
            "Title": fields.get("title") or "N/A",
            "Authors": fields.get("authors") or "N/A",
            "Published": " ".join(published_parts),
            "Journal": fields.get("journal") or "N/A",
            "Volume": fields.get("volume") or "N/A",
            "Issue": fields.get("issue") or "N/A",
            "Pages": fields.get("pages") or "N/A",
            "PubMed_Link": pubmed_link,
            "doi": doi
        }
    return None

def read_file_with_encoding(file_path: str) -> Optional[List[str]]:
    """Read a text file as a list of lines, trying several encodings in turn.

    The encodings are ordered from strictest to most permissive:

    * ``utf-8-sig`` decodes UTF-8 with or without a byte-order mark and drops
      the mark, so it cannot end up glued to the first line.
    * ``cp1252`` comes before ``latin-1`` because ``latin-1`` accepts every
      byte sequence; if it were tried first, Windows-1252 punctuation such as
      en dashes would silently be decoded as control characters.
    * ``latin-1`` is the last resort.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file's lines, as returned by ``readlines()`` (line endings kept).

    Raises:
        ValueError: If none of the encodings can decode the file.
        OSError: If the file cannot be opened (not caught here).
    """
    encodings = ['utf-8-sig', 'cp1252', 'latin-1']
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.readlines()
        except UnicodeDecodeError:
            # Wrong guess; fall through to the next, more permissive encoding.
            continue
    raise ValueError(f"Could not read file with any of the following encodings: {', '.join(encodings)}")

def preprocess_citations(lines: List[str], skip_prefixes: tuple = ("Histopathological Changes",)) -> List[str]:
    """Merge physical lines into one string per citation.

    A line that starts with an index such as "1:" or "1." opens a new
    citation. Every other non-blank line is appended, separated by a single
    space, to the citation currently being collected. Blank lines and lines
    starting with one of ``skip_prefixes`` are ignored.

    Args:
        lines: Raw lines of the input; each is stripped before use.
        skip_prefixes: Prefixes of non-citation lines (e.g. a document header)
            to drop. A tuple or other iterable of strings; a single string is
            also accepted. Pass ``()`` to keep every line.

    Returns:
        The combined citations in input order, each still carrying its index.
    """
    # A lone string would otherwise be split into single-character prefixes.
    if isinstance(skip_prefixes, str):
        prefixes = (skip_prefixes,)
    else:
        prefixes = tuple(skip_prefixes)

    index_start = re.compile(r"^\d+[:\.]\s*")
    citations = []
    current_citation = ""

    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            continue
        if index_start.match(line):  # Start of a new citation with : or .
            if current_citation:
                citations.append(current_citation)
            current_citation = line
        elif not line.startswith(prefixes):  # Continuation, skip headers
            current_citation += " " + line

    if current_citation:
        citations.append(current_citation)

    return citations

def convert_citations_to_json(input_file: str, output_file: str, research_idea: str) -> None:
    """Parse the citations in ``input_file`` and write them to ``output_file`` as JSON.

    The output is an object with two keys: "Research Idea" (the given
    ``research_idea``) and "PubMed Articles" (the list of parsed citations).
    Problems are reported with ``print`` rather than raised: a read failure
    aborts before anything is written, an unparseable citation is skipped
    with a warning, and a write failure is reported.

    Args:
        input_file: Path of the text file holding the citations.
        output_file: Path of the JSON file to create or overwrite.
        research_idea: Label stored alongside the parsed articles.
    """
    # Stage 1: load the file and group its lines into citations.
    try:
        raw_lines = read_file_with_encoding(input_file)
        if not raw_lines:
            raise ValueError("No content could be read from the file")

        citations = preprocess_citations(raw_lines)

    except (IOError, ValueError) as e:
        print(f"Error reading input file: {e}")
        return

    # Stage 2: parse each citation, keeping only the ones that parse.
    articles = []
    for entry in citations:
        if not entry:
            continue
        parsed = parse_citation(entry)
        if parsed:
            articles.append(parsed)
            continue
        print(f"Warning: Could not parse citation: {entry}")

    # Stage 3: serialise the result.
    payload = {"Research Idea": research_idea, "PubMed Articles": articles}
    try:
        with open(output_file, 'w', encoding='utf-8') as file:
            json.dump(payload, file, indent=4)
        print(f"Successfully wrote {len(articles)} articles to {output_file}")
    except IOError as e:
        print(f"Error writing to output file: {e}")

# Example usage: reads citations.txt from the working directory and writes
# the parsed result to output.json.
if __name__ == "__main__":
    # Save your citations to citations.txt exactly as provided
    input_file, output_file = "citations.txt", "output.json"
    research_idea = "Cyclophotocoagulation Techniques in Glaucoma Treatment"

    convert_citations_to_json(
        input_file=input_file,
        output_file=output_file,
        research_idea=research_idea,
    )