from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
import json
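# Requires: pip install playwright beautifulsoup4
# plus a one-time browser download: playwright install chromium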

def scrape_product_page(url):
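    """Fetch a product page in a headless browser, then parse the rendered HTML.

    Returns a dict with page sections keyed by CSS class, the full visible
    text, and a few heuristically extracted product fields.
    """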
    with sync_playwright() as p:
        # Launch browser
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        page.goto(url)
        
        # Wait for network activity to stabilize
        page.wait_for_load_state("networkidle")
        
        # Get the rendered HTML
        content = page.content()
        browser.close()

    # Parse with BeautifulSoup
    soup = BeautifulSoup(content, "html.parser")
    
    # Extract page sections dynamically, keyed by their first CSS class
    sections = {}
    for section in soup.find_all(["div", "section"]):
        text = section.get_text(strip=True)
        if len(text) > 50:  # Keep only sections with meaningful content
            # Fall back to "Unknown" when the element has no class attribute
            classes = section.get("class") or ["Unknown"]
            sections[classes[0]] = text

    # Extract the full visible text of the page
    text_content = soup.get_text(separator=" ", strip=True)

    # Extract specific product details (example heuristics; selectors are site-specific)
    # Note: find() takes tag names/attributes, not CSS selectors, so use select_one()
    title_el = soup.select_one("div.product-details__title h2")
    brand_el = soup.select_one("div.product-details__title h1")
    price_el = soup.find(class_="price")
    desc_el = soup.find("p")  # crude fallback: first paragraph on the page
    data = {
        "title": title_el.get_text(strip=True) if title_el else "N/A",
        "brand": brand_el.get_text(strip=True) if brand_el else "N/A",
        "price": price_el.get_text(strip=True) if price_el else "N/A",
        "description": desc_el.get_text(strip=True) if desc_el else "N/A",
    }

    return {"sections": sections, "text_content": text_content, "data": data}

# Example usage
if __name__ == "__main__":
    url = "https://www.russellandbromley.co.uk/dartmouth/241888"
    result = scrape_product_page(url)

    # Save results (UTF-8 so currency symbols and accents survive)
    with open("output.json", "w", encoding="utf-8") as f:
        json.dump(result, f, indent=4, ensure_ascii=False)

    print("Scraping complete. Data saved to output.json.")
