# Source code for qianfan.dataset.local_data_operators.utils

# Copyright (c) 2023 Baidu, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
utilities for local data operators
"""
import re
from typing import List, Set, Union

import sentencepiece
from ltp import StnSplit

from qianfan.utils import log_warn

# Module-level LTP sentence splitter, instantiated once at import time and
# reused by pyltp_split_sentence for every call.
spliter = StnSplit()


def pyltp_split_sentence(document: str) -> List[str]:
    """Split ``document`` into sentences with the module-level LTP splitter.

    Args:
        document: text to split.

    Returns:
        Whatever ``spliter.split`` returns for the document. If the splitter
        raises any exception, a warning is logged and an empty list is
        returned instead of propagating the error.
    """
    # `spliter` is only read here, so no `global` declaration is needed.
    try:
        return spliter.split(document)
    except Exception as exc:
        # Best-effort: a document that cannot be split yields no sentences.
        log_warn(
            f"error occurred during split document into sentence: {exc!s}, make"
            " sentences list empty"
        )
        return []


class SentencePieceTokenizer:
    """Thin wrapper around a loaded ``sentencepiece`` model."""

    def __init__(self, sentencepiece_model_path: str) -> None:
        """Create a SentencePiece processor and load the model file.

        Args:
            sentencepiece_model_path: path handed to
                ``SentencePieceProcessor.Load``.
        """
        # Remember where the model came from.
        self.sentencepiece_model_path: str = sentencepiece_model_path
        self.sentencepiece_model = sentencepiece.SentencePieceProcessor()
        self.sentencepiece_model.Load(sentencepiece_model_path)

    def tokenize(
        self, document: str, join_on_whitespace: bool = False
    ) -> Union[str, List[str]]:
        """Encode ``document`` into SentencePiece pieces.

        Args:
            document: text to tokenize.
            join_on_whitespace: when true, join the pieces with a single
                space and return one string.

        Returns:
            The pieces produced by ``EncodeAsPieces``, or their
            space-joined string when ``join_on_whitespace`` is set.
        """
        pieces = self.sentencepiece_model.EncodeAsPieces(document)
        if not join_on_whitespace:
            return pieces
        return " ".join(pieces)


def get_words_from_document(
    document: str,
    language: str,
    sentence_piece_tokenizer: SentencePieceTokenizer,
    need_to_lower: bool = True,
    strip_characters: Set[str] = set(),
) -> List[str]:
    """Break ``document`` into a list of words.

    Args:
        document: text to split.
        language: language code; only ``"ZH"`` is routed to the
            SentencePiece tokenizer.
        sentence_piece_tokenizer: tokenizer used for ``"ZH"`` documents
            (ignored for other languages, or when it is falsy).
        need_to_lower: lowercase every word when true.
        strip_characters: characters trimmed from both ends of each word;
            words left empty after trimming are dropped. Nothing is
            trimmed when the set is empty.

    Returns:
        The list of (optionally lowercased and trimmed) words.
    """
    # Only Chinese is tokenized with SentencePiece; every other language is
    # split on spaces, newlines and tabs.
    if language in ["ZH"] and sentence_piece_tokenizer:
        words = sentence_piece_tokenizer.tokenize(document, join_on_whitespace=False)
    else:
        words = split_on_whitespace(document, new_line=True, tab=True)

    # tokenize() may be typed as returning str; here it must be a list.
    assert isinstance(words, list)

    if need_to_lower:
        words = [token.lower() for token in words]

    if not strip_characters:
        return words

    trimmed = [strip(token, strip_characters) for token in words]
    return remove_empty_el_from_list(trimmed)


def split_on_whitespace(
    document: str,
    new_line: bool = False,
    tab: bool = False,
) -> List[str]:
    """Split ``document`` on spaces and, optionally, newlines and tabs.

    Args:
        document: text to split.
        new_line: also treat ``"\\n"`` as a separator.
        tab: also treat ``"\\t"`` as a separator.

    Returns:
        The non-empty pieces between separators, in order.
    """
    separators = [" "]
    if new_line:
        separators.append("\n")
    if tab:
        separators.append("\t")

    # Alternation of the enabled separators, e.g. " |\n|\t".
    pattern = "|".join(separators)
    pieces = re.split(pattern, document)
    # Consecutive separators produce empty strings; drop them.
    return remove_empty_el_from_list(pieces)


def remove_empty_el_from_list(list_: List[str]) -> List[str]:
    """Return a new list containing only the truthy elements of ``list_``."""
    return list(filter(None, list_))


def strip(document: str, strip_characters: Set[str]) -> str:
    """Trim leading and trailing characters found in ``strip_characters``.

    Behaves like ``str.strip`` but takes the characters as a set, so each
    membership test is a set lookup. The original author notes this is much
    faster when the set is large (e.g. all the emojis).

    Args:
        document: text to trim.
        strip_characters: characters to remove from both ends.

    Returns:
        ``document`` without the leading/trailing run of strip characters;
        an empty string if every character is stripped.
    """
    if not document:
        return document

    start = 0
    stop = len(document)
    # Advance past the leading run of strip characters.
    while start < stop and document[start] in strip_characters:
        start += 1
    # Back up over the trailing run, never crossing `start`.
    while stop > start and document[stop - 1] in strip_characters:
        stop -= 1
    return document[start:stop]


def words_augmentation(words: List[str], group_size: int, join_char: str) -> List[str]:
    """Build every run of ``group_size`` consecutive words, joined together.

    Intended for languages such as Chinese (no space between words) and
    Vietnamese (a space between syllables).

    Args:
        words: the word sequence to slide over.
        group_size: number of consecutive words per group.
        join_char: string placed between the words of a group.

    Returns:
        One joined string per window position, in order; empty when
        ``words`` is shorter than ``group_size``.
    """
    window_count = len(words) - group_size + 1
    augmentation: List[str] = []
    for start in range(window_count):
        window = words[start : start + group_size]
        augmentation.append(join_char.join(window))
    return augmentation


def get_augmentation_word_list(
    words: List[str], group_size_list: List[int], join_char: str
) -> List[str]:
    """Concatenate the word augmentations for several group sizes.

    Args:
        words: the word sequence to augment.
        group_size_list: group sizes to apply, in order.
        join_char: string placed between the words of a group.

    Returns:
        A flat list holding the output of ``words_augmentation`` for each
        group size, in the order the sizes are given.
    """
    combined: List[str] = []
    for group_size in group_size_list:
        combined.extend(words_augmentation(words, group_size, join_char))
    return combined