Source code for src.openCHA.tasks.extract_text
import io
from typing import Any
from typing import List
from urllib.parse import urlparse
import requests
from openCHA.tasks import BaseTask
from pydantic import model_validator
class ExtractText(BaseTask):
"""
**Description:**
This task extracts all the text from the current webpage.
"""
name: str = "extract_text"
chat_name: str = "ExtractText"
description: str = "Extract all the text on the current webpage"
dependencies: List[str] = []
inputs: List[str] = [
"url to extract the text from. It requires links which is gathered from other tools. Never provide urls on your own."
]
outputs: List[str] = [
"An string containing the text of the scraped webpage."
]
output_type: bool = False
sync_playwright: Any = None
high_level: Any = None
bs4: Any = None
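    # The three attributes above are populated by the validator below with
    # the lazily imported playwright, pdfminer, and BeautifulSoup handles.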
@model_validator(mode="before")
    def check_dependencies(cls, values: dict) -> dict:
        """
        Check that the required optional dependencies can be imported
        and store the imported handles on the attribute values.
        Args:
            values (Dict): The current attribute values.
        Return:
            Dict: The updated attribute values.
        Raise:
            ImportError: If the 'beautifulsoup4', 'lxml', 'pdfminer.six', or 'playwright' packages are not installed.
        """
try:
from bs4 import BeautifulSoup # noqa: F401
values["bs4"] = BeautifulSoup
except ImportError:
raise ImportError(
"The 'beautifulsoup4' package is required to use this tool."
" Please install it with 'pip install beautifulsoup4'."
)
try:
import lxml # noqa: F401
except ImportError:
raise ImportError(
"The 'lxml' package is required to use this tool."
" Please install it with 'pip install lxml'."
)
try:
from pdfminer import high_level # noqa: F401
values["high_level"] = high_level
except ImportError:
raise ImportError(
"The 'pdfminer' package is required to use this tool."
" Please install it with 'pip install pdfminer.six'."
)
try:
from playwright.sync_api import sync_playwright
values["sync_playwright"] = sync_playwright
except ImportError:
            raise ImportError(
                "The 'playwright' package is required to use this tool."
                " Please install it with 'pip install playwright' and"
                " download the browser binaries with 'playwright install'."
            )
return values
def validate_url(self, url):
"""
This method validates a given URL by checking if its scheme is either 'http' or 'https'.
Args:
url (str): The URL to be validated.
Return:
str: The validated URL.
Raise:
ValueError: If the URL scheme is not 'http' or 'https'.
"""
parsed_url = urlparse(url)
if parsed_url.scheme not in ("http", "https"):
raise ValueError("URL scheme must be 'http' or 'https'")
return url
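
    # Illustrative behavior (placeholder URLs, shown as comments only):
    #   self.validate_url("https://example.com")  -> "https://example.com"
    #   self.validate_url("ftp://example.com")    -> raises ValueError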
def _execute(
self,
inputs: List[Any],
) -> str:
"""
Execute the ExtractText task.
Args:
input (str): The input parameter for the task.
Return:
str: The extracted text from the current webpage.
Raise:
ValueError: If the synchronous browser is not provided.
"""
        url = inputs[0].strip()
        self.validate_url(url)
        if url.lower().endswith(".pdf"):
            # Request the PDF content from the URL
            response = requests.get(url)
if response.status_code == 200:
# Use BytesIO to create an in-memory stream
pdf_stream = io.BytesIO(response.content)
# Extract text from the PDF stream
text = self.high_level.extract_text(pdf_stream)
# Wrap text in basic HTML tags
html_content = (
f"<html><body><p>{text}</p></body></html>"
)
# Parse the HTML content with BeautifulSoup
soup = self.bs4(html_content, "lxml")
return " ".join(
text for text in soup.stripped_strings
)
else:
return "Error extracting text. The url is wrong. Try again."
else:
with self.sync_playwright() as playwright:
chromium = (
playwright.chromium
) # or "firefox" or "webkit".
browser = chromium.launch()
page = browser.new_page()
                response = page.goto(url)
status = response.status if response else "unknown"
if status == 200:
html_content = page.content()
# Parse the HTML content with BeautifulSoup
soup = self.bs4(html_content, "lxml")
page.close()
browser.close()
return " ".join(
text for text in soup.stripped_strings
)
else:
page.close()
browser.close()
return "Error extracting text. The url is wrong. Try again."
def explain(
self,
) -> str:
"""
Explain the ExtractText task.
Return:
str: A brief explanation of the ExtractText task.
"""
return "This task returns the ulr of the current page."