Module osbot_utils.helpers.html.Html_To_Dict

Expand source code
from html.parser import HTMLParser

class Html_To_Dict(HTMLParser):
    def __init__(self, html):
        super().__init__()
        self.root    = None            # No root initially
        self.current = None         # No current node at the start
        self.stack   = []             # Empty stack for hierarchy management
        self.html    = html or ''

    def convert(self):
        self.feed(self.html)
        return self.root

    def handle_starttag(self, tag, attrs):
        new_tag = {"tag": tag, "attrs": dict(attrs), "children": [], "data": ""}

        if self.current is None:
            # When the first tag is encountered, it becomes the root
            self.root = new_tag
            self.current = new_tag
        else:
            # Otherwise, append the new tag as a child of the current tag
            self.current["children"].append(new_tag)

        # Update the stack and current pointers
        self.stack.append(new_tag)
        self.current = new_tag

    def handle_endtag(self, tag):
        if len(self.stack) > 1:
            self.stack.pop()
            self.current = self.stack[-1]

    def handle_data(self, data):
        if data.strip():  # Ignore whitespace
            if "data" in self.current:
                self.current["data"] += data
            else:
                self.current["data"] = data

    def print__generate_lines(self, node, indent="", last=True, is_root=True):
        lines = []

        prefix = "" if is_root else ("└── " if last else "├── ")

        tag = node.get("tag")
        attrs = node.get("attrs", {})
        children = node.get("children", [])
        attrs_str = ' '.join(f'{key}="{value}"' for key, value in attrs.items())
        attrs_str = f' ({attrs_str})' if attrs_str else ''

        lines.append(f"{indent}{prefix}{tag}{attrs_str}")

        child_indent = indent + ("    " if last else "│   ")

        for i, child in enumerate(children):
            is_last = i == len(children) - 1
            child_lines = self.print__generate_lines(child, indent=child_indent, last=is_last, is_root=False)
            lines.extend(child_lines if isinstance(child_lines, list) else [child_lines])

        return lines if is_root else "\n".join(lines)

    def print(self, just_return_lines=False):
        if self.root:
            lines = self.print__generate_lines(self.root, is_root=True)
            if just_return_lines:
                return lines
            else:
                self.print__lines(lines)
        return self

    def print__lines(self, lines):
        for line in lines:
            print(line)

Classes

class Html_To_Dict (html)

Find tags and other markup and call handler functions.

Usage

p = HTMLParser() p.feed(data) … p.close()

Start tags are handled by calling self.handle_starttag() or self.handle_startendtag(); end tags by self.handle_endtag(). The data between tags is passed from the parser to the derived class by calling self.handle_data() with the data as argument (the data may be split up in arbitrary chunks). If convert_charrefs is True the character references are converted automatically to the corresponding Unicode character (and self.handle_data() is no longer split in chunks), otherwise they are passed by calling self.handle_entityref() or self.handle_charref() with the string containing respectively the named or numeric reference as the argument.

Initialize and reset this instance.

If convert_charrefs is True (the default), all character references are automatically converted to the corresponding Unicode characters.

Expand source code
class Html_To_Dict(HTMLParser):
    def __init__(self, html):
        super().__init__()
        self.root    = None            # No root initially
        self.current = None         # No current node at the start
        self.stack   = []             # Empty stack for hierarchy management
        self.html    = html or ''

    def convert(self):
        self.feed(self.html)
        return self.root

    def handle_starttag(self, tag, attrs):
        new_tag = {"tag": tag, "attrs": dict(attrs), "children": [], "data": ""}

        if self.current is None:
            # When the first tag is encountered, it becomes the root
            self.root = new_tag
            self.current = new_tag
        else:
            # Otherwise, append the new tag as a child of the current tag
            self.current["children"].append(new_tag)

        # Update the stack and current pointers
        self.stack.append(new_tag)
        self.current = new_tag

    def handle_endtag(self, tag):
        if len(self.stack) > 1:
            self.stack.pop()
            self.current = self.stack[-1]

    def handle_data(self, data):
        if data.strip():  # Ignore whitespace
            if "data" in self.current:
                self.current["data"] += data
            else:
                self.current["data"] = data

    def print__generate_lines(self, node, indent="", last=True, is_root=True):
        lines = []

        prefix = "" if is_root else ("└── " if last else "├── ")

        tag = node.get("tag")
        attrs = node.get("attrs", {})
        children = node.get("children", [])
        attrs_str = ' '.join(f'{key}="{value}"' for key, value in attrs.items())
        attrs_str = f' ({attrs_str})' if attrs_str else ''

        lines.append(f"{indent}{prefix}{tag}{attrs_str}")

        child_indent = indent + ("    " if last else "│   ")

        for i, child in enumerate(children):
            is_last = i == len(children) - 1
            child_lines = self.print__generate_lines(child, indent=child_indent, last=is_last, is_root=False)
            lines.extend(child_lines if isinstance(child_lines, list) else [child_lines])

        return lines if is_root else "\n".join(lines)

    def print(self, just_return_lines=False):
        if self.root:
            lines = self.print__generate_lines(self.root, is_root=True)
            if just_return_lines:
                return lines
            else:
                self.print__lines(lines)
        return self

    def print__lines(self, lines):
        for line in lines:
            print(line)

Ancestors

  • html.parser.HTMLParser
  • _markupbase.ParserBase

Methods

def convert(self)
Expand source code
def convert(self):
    self.feed(self.html)
    return self.root
def handle_data(self, data)
Expand source code
def handle_data(self, data):
    if data.strip():  # Ignore whitespace
        if "data" in self.current:
            self.current["data"] += data
        else:
            self.current["data"] = data
def handle_endtag(self, tag)
Expand source code
def handle_endtag(self, tag):
    if len(self.stack) > 1:
        self.stack.pop()
        self.current = self.stack[-1]
def handle_starttag(self, tag, attrs)
Expand source code
def handle_starttag(self, tag, attrs):
    new_tag = {"tag": tag, "attrs": dict(attrs), "children": [], "data": ""}

    if self.current is None:
        # When the first tag is encountered, it becomes the root
        self.root = new_tag
        self.current = new_tag
    else:
        # Otherwise, append the new tag as a child of the current tag
        self.current["children"].append(new_tag)

    # Update the stack and current pointers
    self.stack.append(new_tag)
    self.current = new_tag
def print(self, just_return_lines=False)
Expand source code
def print(self, just_return_lines=False):
    if self.root:
        lines = self.print__generate_lines(self.root, is_root=True)
        if just_return_lines:
            return lines
        else:
            self.print__lines(lines)
    return self
def print__generate_lines(self, node, indent='', last=True, is_root=True)
Expand source code
def print__generate_lines(self, node, indent="", last=True, is_root=True):
    lines = []

    prefix = "" if is_root else ("└── " if last else "├── ")

    tag = node.get("tag")
    attrs = node.get("attrs", {})
    children = node.get("children", [])
    attrs_str = ' '.join(f'{key}="{value}"' for key, value in attrs.items())
    attrs_str = f' ({attrs_str})' if attrs_str else ''

    lines.append(f"{indent}{prefix}{tag}{attrs_str}")

    child_indent = indent + ("    " if last else "│   ")

    for i, child in enumerate(children):
        is_last = i == len(children) - 1
        child_lines = self.print__generate_lines(child, indent=child_indent, last=is_last, is_root=False)
        lines.extend(child_lines if isinstance(child_lines, list) else [child_lines])

    return lines if is_root else "\n".join(lines)
def print__lines(self, lines)
Expand source code
def print__lines(self, lines):
    for line in lines:
        print(line)