Source code for anchorman.positioner.slices

# -*- coding: utf-8 -*-
import re
from bs4 import BeautifulSoup


def element_slices(text, elements, element_identifier):
    """Get slices of all elements in text.

    Each item of ``elements`` is a mapping whose first key is the token to
    look for; the token is matched in ``text`` as a whole word (``\\b``
    boundaries on both sides).

    :param text: the string to search.
    :param elements: iterable of single-token mappings; when several
        elements share a token, the first one is reported.
    :param element_identifier: value passed through unchanged into every
        result tuple.
    :returns: list of ``(token, (start, end), (element_identifier, element))``
        tuples, in the order the matches occur in ``text``.
    """
    # Map every token to the first element declaring it, keeping the tokens
    # in their original order so the regex alternation order is unchanged.
    first_by_token = {}
    tokens = []
    for element in elements:
        # next(iter(...)) works on Python 2 and 3; dict.keys()[0] does not.
        token = next(iter(element))
        if token not in first_by_token:
            first_by_token[token] = element
            tokens.append(token)

    # An empty alternation would match the empty string at every position.
    if not tokens:
        return []

    # Escape tokens so regex metacharacters in them are matched literally.
    pattern = "|".join(r"\b{}\b".format(re.escape(t)) for t in tokens)
    token_regex = re.compile(pattern)

    result = []
    for match in token_regex.finditer(text):
        token = match.group()
        result.append(
            (token, match.span(), (element_identifier, first_by_token[token]))
        )

    return result


def unit_slices(text, text_unit):
    """Get slices of the text units specified in setting.

    :param text: the string to split into units.
    :param text_unit: mapping with the entries ``'key'`` and ``'name'``.
        ``('t', 'text')`` treats the whole text as a single unit; a name
        starting with ``'html'`` or ``'xml'`` treats every element whose tag
        is ``key`` as one unit.
    :returns: list of ``(key, (start, end), (name, index))`` tuples.
    :raises NotImplementedError: for any other kind of text unit.
    :raises ValueError: if the serialized form of a parsed element cannot be
        located in ``text`` (e.g. the parser normalized the markup).
    """
    text_unit_key, text_unit_name = text_unit['key'], text_unit['name']
    text_unit_pair = (text_unit_key, text_unit_name)

    result = []
    if text_unit_pair == ('t', 'text'):
        # there is only one unit: the whole text
        result.append((text_unit_key, (0, len(text)), (text_unit_name, 0)))

    elif text_unit_name.startswith(('html', 'xml')):
        soup = BeautifulSoup(text, "lxml")
        # Search forward from just past the previous unit's start, so that
        # repeated identical elements each get their own offsets instead of
        # all resolving to the first occurrence.
        # NOTE(review): relies on find_all yielding elements in document
        # order, as documented for a tree traversal -- confirm for inputs
        # the parser has to restructure.
        search_from = 0
        for i, a_text_unit in enumerate(soup.find_all(text_unit_key)):
            markup = str(a_text_unit)
            _from = text.index(markup, search_from)
            _to = _from + len(markup)
            result.append((text_unit_key, (_from, _to), (text_unit_name, i)))
            search_from = _from + 1

    # TODO: ('s', 'sentence') units -- use some external library or at
    # least have it in mind.

    else:
        raise NotImplementedError

    return result

Source code for anchorman.positioner.slices

Anchorman

Related Topics

Source code for anchorman.positioner.slices

Anchorman

Related Topics

Quick search