Source code for src.openCHA.tasks.extract_text
import io
from typing import Any
from typing import List
from urllib.parse import urlparse
import requests
from openCHA.tasks import BaseTask
from pydantic import model_validator
class ExtractText(BaseTask):
"""
**Description:**
This task extracts all the text from the current webpage.
"""
name: str = "extract_text"
chat_name: str = "ExtractText"
description: str = "Extract all the text on the current webpage"
dependencies: List[str] = []
inputs: List[str] = [
"url to extract the text from. It requires links which is gathered from other tools. Never provide urls on your own."
]
outputs: List[str] = [
"An string containing the text of the scraped webpage."
]
output_type: bool = False
sync_playwright: Any = None
high_level: Any = None
bs4: Any = None
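    # The three attributes above are populated by the validator below with
    # the lazily imported playwright, pdfminer, and BeautifulSoup handles.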
@model_validator(mode="before")
    def check_dependencies(cls, values: dict) -> dict:
        """
        Check that the required optional dependencies can be imported
        and store the imported handles on the attribute values.
        Args:
            values (Dict): The current attribute values.
        Return:
            Dict: The updated attribute values.
        Raise:
            ImportError: If the 'beautifulsoup4', 'lxml', 'pdfminer.six', or 'playwright' packages are not installed.
        """
try:
from bs4 import BeautifulSoup # noqa: F401
values["bs4"] = BeautifulSoup
except ImportError:
raise ImportError(
"The 'beautifulsoup4' package is required to use this tool."
" Please install it with 'pip install beautifulsoup4'."
)
try:
import lxml # noqa: F401
except ImportError:
raise ImportError(
"The 'lxml' package is required to use this tool."
" Please install it with 'pip install lxml'."
)
try:
from pdfminer import high_level # noqa: F401
values["high_level"] = high_level
except ImportError:
raise ImportError(
"The 'pdfminer' package is required to use this tool."
" Please install it with 'pip install pdfminer.six'."
)
try:
from playwright.sync_api import sync_playwright
values["sync_playwright"] = sync_playwright
except ImportError:
            raise ImportError(
                "The 'playwright' package is required to use this tool."
                " Please install it with 'pip install playwright' and"
                " download the browser binaries with 'playwright install'."
            )
return values
def validate_url(self, url):
"""
This method validates a given URL by checking if its scheme is either 'http' or 'https'.
Args:
url (str): The URL to be validated.
Return:
str: The validated URL.
Raise:
ValueError: If the URL scheme is not 'http' or 'https'.
"""
parsed_url = urlparse(url)
if parsed_url.scheme not in ("http", "https"):
raise ValueError("URL scheme must be 'http' or 'https'")
return url
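
    # Illustrative behavior (placeholder URLs, shown as comments only):
    #   self.validate_url("https://example.com")  -> "https://example.com"
    #   self.validate_url("ftp://example.com")    -> raises ValueError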
def _execute(
self,
inputs: List[Any],
) -> str:
"""
Execute the ExtractText task.
Args:
input (str): The input parameter for the task.
Return:
str: The extracted text from the current webpage.
Raise:
ValueError: If the synchronous browser is not provided.
"""
        url = inputs[0].strip()
        self.validate_url(url)
        if url.lower().endswith(".pdf"):
            # Request the PDF content from the URL
            response = requests.get(url)
if response.status_code == 200:
# Use BytesIO to create an in-memory stream
pdf_stream = io.BytesIO(response.content)
# Extract text from the PDF stream
text = self.high_level.extract_text(pdf_stream)
# Wrap text in basic HTML tags
html_content = (
f"<html><body><p>{text}</p></body></html>"
)
# Parse the HTML content with BeautifulSoup
soup = self.bs4(html_content, "lxml")
return " ".join(
text for text in soup.stripped_strings
)
else:
return "Error extracting text. The url is wrong. Try again."
else:
with self.sync_playwright() as playwright:
chromium = (
playwright.chromium
) # or "firefox" or "webkit".
browser = chromium.launch()
page = browser.new_page()
                response = page.goto(url)
status = response.status if response else "unknown"
if status == 200:
html_content = page.content()
# Parse the HTML content with BeautifulSoup
soup = self.bs4(html_content, "lxml")
page.close()
browser.close()
return " ".join(
text for text in soup.stripped_strings
)
else:
page.close()
browser.close()
return "Error extracting text. The url is wrong. Try again."
def explain(
self,
) -> str:
"""
Explain the ExtractText task.
Return:
str: A brief explanation of the ExtractText task.
"""
return "This task returns the ulr of the current page."