qianfan.dataset package

Library aimed at helping developers interact with datasets

class qianfan.dataset.DataExportDestinationType(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)[source]

Bases: int, Enum

PlatformBos: int = 0
PrivateBos: int = 1
class qianfan.dataset.DataProjectType(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)[source]

Bases: int, Enum

Project type used by Qianfan Data

Conversation: int = 20
GenericText: int = 401
QuerySet: int = 402
Text2Image: int = 705
class qianfan.dataset.DataSetType(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)[source]

Bases: int, Enum

MultiModel: int = 7
TextOnly: int = 4
class qianfan.dataset.DataSource[source]

Bases: ABC

basic data source class

abstract async afetch(**kwargs: Any) str[source]

Asynchronously fetch data from source

Args:

**kwargs (Any): optional arguments

Returns:

str: content retrieved from data source

abstract async asave(data: str, **kwargs: Any) bool[source]

Asynchronously export the data to the data source and return whether the export succeeded

Args:

data (str): data that needs to be saved. **kwargs (Any): optional arguments

Returns:

bool: is saving successful

abstract fetch(**kwargs: Any) str[source]

Fetch data from source

Args:

**kwargs (Any): optional arguments

Returns:

str: content retrieved from data source

abstract format_type() FormatType[source]

Get format type binding to source

Returns:

FormatType: format type binding to source

abstract save(data: str, **kwargs: Any) bool[source]

Export the data to the data source and return whether the export succeeded

Args:

data (str): data that needs to be saved. **kwargs (Any): optional arguments

Returns:

bool: is saving successful

abstract set_format_type(format_type: FormatType) None[source]

Set format type binding to source

Args:

format_type (FormatType): format type binding to source

class qianfan.dataset.DataSourceType(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)[source]

Bases: int, Enum

PrivateBos: int = 1
SharedZipUrl: int = 2
class qianfan.dataset.DataStorageType(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)[source]

Bases: str, Enum

PrivateBos: str = 'usrBos'
PublicBos: str = 'sysBos'
class qianfan.dataset.DataTemplateType(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)[source]

Bases: int, Enum

Template type used by Qianfan Data

GenericText: int = 40100
NonSortedConversation: int = 2000
QuerySet: int = 40200
SortedConversation: int = 2001
Text2Image: int = 70500
class qianfan.dataset.Dataset(inner_table: Table, inner_data_source_cache: Optional[DataSource] = None, inner_schema_cache: Optional[Schema] = None)[source]

Bases: Table

add_default_group_column() Self[source]

add a “_group” column to the Dataset; the values in the “_group” column are sequentially incremental

Returns:

Self: Dataset itself

append(elem: Any, add_new_group: bool = False, is_grouped: bool = True) Self[source]

append element(s) to dataset

Args:
elem (Union[List[List[Dict]], List[Dict], Tuple[Dict], Dict]):

Elements added to dataset

add_new_group (bool):

Whether elem has a new group id. Only used when dataset is grouped.

is_grouped (bool):

Whether the elements in elem are in the same group. Only used when dataset is grouped, elem is a Sequence, and add_new_group is set to True. Defaults to True, in which case all elements will be in the same group. If it’s False, each element will have a sequentially incremented group id starting from the last available group id.

Returns:

Self: Dataset itself

col_append(elem: Any) Self[source]

append a column to dataset

Args:
elem (Dict[str, List]): dict containing element added to dataset

must have column name “name” and column data list “data”

Returns:

Self: Dataset itself

col_delete(index: Union[int, str]) Self[source]

delete a column from dataset

Args:

index (str): column name to delete

Returns:

Self: Dataset itself

col_filter(op: Callable[[Any], bool]) Self[source]

filter on dataset’s column

Args:

op (Callable[[Any], bool]): handler used to filter

Returns:

Self: Dataset itself

col_insert(elem: Any, index: Any) Self[source]

insert a column into dataset

Args:
elem (Dict[str, List]): dict containing element added to dataset

must have column name “name” and column data list “data”

index (int): where to insert new column

Returns:

Self: Dataset itself

col_list(by: Optional[Union[slice, int, str, List[int], Tuple[int], List[str], Tuple[str]]] = None) Any[source]

get column(s) from dataset

Args:
by (Optional[Union[int, str, Sequence[int], Sequence[str]]]):

index or indices for columns, default to None, in which case return a python list of dataset column

Returns:

Any: dataset column list

col_map(op: Callable[[Any], Any]) Self[source]

map on dataset’s column

Args:

op (Callable[[Any], Any]): handler used to map

Returns:

Self: Dataset itself

col_names() List[str][source]

get column name list

Returns:

List[str]: column name list

col_renames(new_names: List[str]) Self[source]

rename all dataset columns

Args:

new_names (List[str]): All new names for columns

Returns:

Self: A brand-new Dataset with new name

classmethod create_from_pyarrow_table(table: Table, schema: Optional[Schema] = None) Dataset[source]

create a dataset from pyarrow table

Args:

table (pyarrow.Table): pyarrow table object used to create dataset. schema (Optional[Schema]):

schema used to validate before exporting data, default to None

Returns:

Dataset: a dataset instance

classmethod create_from_pyobj(data: Union[List[Dict[str, Any]], Dict[str, List]], schema: Optional[Schema] = None) Dataset[source]

create a dataset from python dict or list

Args:
data (Union[List[Dict[str, Any]], Dict[str, List]]):

python object used to create dataset.

schema (Optional[Schema]):

schema used to validate before exporting data, default to None

Returns:

Dataset: a dataset instance

delete(index: Union[int, str]) Self[source]

delete an element from dataset

Args:

index (Union[int, str]): element index to delete

Returns:

Self: Dataset itself

delete_group_column() Self[source]

remove “_group” column from Dataset

Returns:

Self: Dataset itself

filter(op: Callable[[Any], bool]) Self[source]

filter on dataset

Args:

op (Callable[[Any], bool]): handler used to filter

Returns:

Self: Dataset itself

insert(elem: Any, index: Any, group_id: int = -1, add_new_group: bool = False, is_grouped: bool = True) Self[source]

insert element(s) to dataset

Args:
elem (Union[List[List[Dict]], List[Dict], Tuple[Dict], Dict]):

Elements added to dataset

index (int): where to insert element(s). group_id (int):

which group id you want to apply to new element(s). Default to -1, which means let group id be automatically inferred from table.

add_new_group (bool):

Whether elem has a new group id. Only used when dataset is grouped and group_id is -1

is_grouped (bool):

Whether the elements in elem are in the same group. Only used when dataset is grouped, elem is a Sequence, and add_new_group is set to True. Defaults to True, in which case all elements will be in the same group. If it’s False, each element will have a sequentially incremented group id starting from the last available group id.

Returns:

Self: Dataset itself

is_dataset_located_in_qianfan() bool[source]

tell whether the current dataset is a cloud dataset

Returns:

bool: whether current is cloud dataset

list(by: Optional[Union[slice, int, str, Sequence[int], Sequence[str]]] = None, **kwargs: Any) Any[source]

get element(s) from dataset

Args:
by (Optional[Union[slice, int, Sequence[int]]]):

index or indices for elements, default to None, in which case return a python list of dataset row

Returns:

Any: dataset row list

classmethod load(source: Optional[DataSource] = None, data_file: Optional[str] = None, qianfan_dataset_id: Optional[int] = None, bos_load_args: Optional[Dict[str, Any]] = None, huggingface_dataset: Optional[Any] = None, schema: Optional[Schema] = None, organize_data_as_group: bool = False, **kwargs: Any) Dataset[source]

Read data from the source or create a source from the parameters and create a Table instance. If a schema is specified, perform validation after importing.

Args:
source (Optional[DataSource]): where dataset load from,

default to None, in which case a datasource will be created inside dataset using parameters below

data_file (Optional[str]):

dataset local file path, default to None

qianfan_dataset_id (Optional[int]):

qianfan dataset ID, default to None

bos_load_args: (Optional[Dict[str, Any]]):

create a dataset and import initial dataset content from args

huggingface_dataset (Optional[Dict[str, Any], Any]):

Huggingface dataset object, only support DatasetDict and Dataset of Huggingface datasets.

schema (Optional[Schema]):

schema used to validate loaded data, default to None

organize_data_as_group (bool):

only available when data source’s format is FormatType.Jsonl. Indicates whether organize data within dataset in group format, default to False, and when it’s True, the default format will be a group-based 2D structure.

**kwargs (Any): optional arguments

Returns:

Dataset: a dataset instance

map(op: Callable[[Any], Any]) Self[source]

map on dataset

Args:

op (Callable[[Any], Any]): handler used to map

Returns:

Self: Dataset itself

online_data_process(operators: List[QianfanOperator]) Dict[str, Any][source]

create an online ETL task on qianfan. Not available currently.

Args:

operators (List[QianfanOperator]): operators applied to ETL task

Returns:
Dict[str, Any]: ETL task info, contains 3 field:

is_succeeded (bool): whether the ETL task succeeded. etl_task_id (Optional[int]): etl task id, only

exists when etl task is created successfully

new_dataset_id (Optional[int]): dataset id which

stores data after etl, only exists when etl task is succeeded

save(destination: Optional[DataSource] = None, data_file: Optional[str] = None, qianfan_dataset_id: Optional[int] = None, qianfan_dataset_create_args: Optional[Dict[str, Any]] = None, schema: Optional[Schema] = None, replace_source: bool = False, **kwargs: Any) bool[source]

Write data to source. If a schema has been passed, validate data before exporting.

Args:
destination (Optional[DataSource]):

data source where dataset exports, default to None, in which case a datasource will be created inside dataset using parameters below

data_file (Optional[str]):

dataset local file path, default to None

qianfan_dataset_id (Optional[int]):

qianfan dataset ID, default to None

qianfan_dataset_create_args: (Optional[Dict[str, Any]]):

create arguments for creating a bare dataset on qianfan, default to None

schema: (Optional[Schema]):

schema used to validate before exporting data, default to None

replace_source: (bool):

if replace the original source, default to False

kwargs (Any): optional arguments

Returns:

bool: is saving succeeded

class qianfan.dataset.FileDataSource(*, path: str, file_format: Optional[FormatType] = None)[source]

Bases: DataSource, BaseModel

file data source

async afetch(**kwargs: Any) str[source]

Asynchronously Read data from file. Not available currently

Args:

**kwargs (Any): Arbitrary keyword arguments.

Returns:

str: A string containing the data read from the file.

async asave(data: str, **kwargs: Any) bool[source]

Asynchronously write data to file. Not available currently.

Args:

data (str): data waiting to be written. **kwargs (Any): optional arguments.

Returns:

bool: has data been written successfully

fetch(**kwargs: Any) str[source]

Read data from file.

Args:

**kwargs (Any): Arbitrary keyword arguments.

Returns:

str: A string containing the data read from the file.

file_format: Optional[FormatType]
format_type() FormatType[source]

Get format type binding to source

Returns:

FormatType: format type binding to source

model_config: ClassVar[ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_fields: ClassVar[dict[str, FieldInfo]] = {'file_format': FieldInfo(annotation=Union[FormatType, NoneType], required=False), 'path': FieldInfo(annotation=str, required=True)}

Metadata about the fields defined on the model, mapping of field names to [FieldInfo][pydantic.fields.FieldInfo].

This replaces Model.__fields__ from Pydantic V1.

path: str
save(data: str, **kwargs: Any) bool[source]

Write data to file.

Args:

data (str): data waiting to be written. **kwargs (Any): optional arguments.

Returns:

bool: has data been written successfully

set_format_type(format_type: FormatType) None[source]

Set format type binding to source

Args:

format_type (FormatType): format type binding to source

class qianfan.dataset.QianfanDataSource(*, id: int, group_id: int, name: str, set_type: DataSetType, project_type: DataProjectType, template_type: DataTemplateType, version: int, storage_type: DataStorageType, storage_id: str, storage_path: str, storage_raw_path: Optional[str] = None, storage_name: str, storage_region: Optional[str] = None, info: Dict[str, Any] = {}, download_when_init: bool = False, data_format_type: FormatType, ak: Optional[str] = None, sk: Optional[str] = None)[source]

Bases: DataSource, BaseModel

Qianfan data source

async afetch(**kwargs: Any) str[source]

Asynchronously read data from qianfan or local cache. Not available currently.

Args:

**kwargs (Any): Arbitrary keyword arguments.

Returns:

str: A string containing the data.

ak: Optional[str]
async asave(data: str, is_annotated: bool = False, **kwargs: Any) bool[source]

Asynchronously write data to qianfan. Currently only writing to user BOS storage is supported.

Not available currently

Args:

data (str): data waiting to be uploaded. is_annotated (bool): whether the data has been annotated. **kwargs (Any): optional arguments.

Returns:

bool: has data been uploaded successfully

classmethod create_bare_dataset(name: str, template_type: DataTemplateType, storage_type: DataStorageType = DataStorageType.PublicBos, storage_id: Optional[str] = None, storage_path: Optional[str] = None, addition_info: Optional[Dict[str, Any]] = None, ak: Optional[str] = None, sk: Optional[str] = None, **kwargs: Any) QianfanDataSource[source]

create a bare dataset on qianfan as data source, which is empty. Args:

name (str): dataset name you want. template_type (DataTemplateType): template type applying to data set. storage_type (Optional[DataStorageType]):

data storage type used to store your data, default to PublicBos

storage_id (Optional[str]): private BOS bucket name,

needed when storage_type is PrivateBos, default to None

storage_path (Optional[str]): private BOS file path,

needed when storage_type is PrivateBos, default to None

addition_info (Optional[Dict[str, Any]]):

additional info you want to have, default to None

ak (Optional[str]):

console ak related to your dataset and bos, default to None

sk (Optional[str]):

console sk related to your dataset and bos, default to None

kwargs (Any): other arguments

Returns:

QianfanDataSource: A datasource represents your dataset on Qianfan

classmethod create_from_bos_file(name: str, template_type: DataTemplateType, storage_id: str, storage_path: str, file_name: str, is_data_annotated: bool, storage_type: DataStorageType = DataStorageType.PrivateBos, addition_info: Optional[Dict[str, Any]] = None, ak: Optional[str] = None, sk: Optional[str] = None, is_download_to_local: bool = True, **kwargs: Any) QianfanDataSource[source]

create a dataset on qianfan as data source, which will import data from a specific BOS. Args:

name (str): dataset name you want. template_type (DataTemplateType): template type applying to data set. storage_id (str): private BOS bucket name. storage_path (str): private BOS file path. file_name (str): file that needs to be uploaded. is_data_annotated (bool): whether the data in BOS is annotated. storage_type (Optional[DataStorageType]):

data storage type used to store your data, default to PrivateBos

addition_info (Optional[Dict[str, Any]]):

additional info you want to have, default to None

ak (Optional[str]):

console ak related to your dataset and bos, default to None

sk (Optional[str]):

console sk related to your dataset and bos, default to None

is_download_to_local (bool):

whether the dataset downloads the file when the object is initialized, default to True

kwargs (Any): other arguments

Returns:

QianfanDataSource: A datasource represents your dataset on Qianfan

data_format_type: FormatType
download_when_init: bool
fetch(**kwargs: Any) str[source]

Read data from qianfan or local cache.

Args:

**kwargs (Any): Arbitrary keyword arguments.

Returns:

str: A string containing the data.

format_type() FormatType[source]

Get format type binding to qianfan data source

Returns:

FormatType: format type binding to qianfan data source

classmethod get_existed_dataset(dataset_id: int, is_download_to_local: bool = True, ak: Optional[str] = None, sk: Optional[str] = None, **kwargs: Any) QianfanDataSource[source]

Load a dataset from qianfan as data source

Args:

dataset_id (int): dataset id on Qianfan, shown as “数据集版本 ID” (dataset version ID). is_download_to_local (bool):

whether the dataset downloads the file when the object is initialized, default to True

ak (Optional[str]):

console ak related to your dataset and bos, default to None

sk (Optional[str]):

console sk related to your dataset and bos, default to None

kwargs (Any): other arguments

Returns:

QianfanDataSource: A datasource represents your dataset on Qianfan

group_id: int
id: int
info: Dict[str, Any]
model_config: ClassVar[ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_fields: ClassVar[dict[str, FieldInfo]] = {'ak': FieldInfo(annotation=Union[str, NoneType], required=False), 'data_format_type': FieldInfo(annotation=FormatType, required=True), 'download_when_init': FieldInfo(annotation=bool, required=False, default=False), 'group_id': FieldInfo(annotation=int, required=True), 'id': FieldInfo(annotation=int, required=True), 'info': FieldInfo(annotation=Dict[str, Any], required=False, default={}), 'name': FieldInfo(annotation=str, required=True), 'project_type': FieldInfo(annotation=DataProjectType, required=True), 'set_type': FieldInfo(annotation=DataSetType, required=True), 'sk': FieldInfo(annotation=Union[str, NoneType], required=False), 'storage_id': FieldInfo(annotation=str, required=True), 'storage_name': FieldInfo(annotation=str, required=True), 'storage_path': FieldInfo(annotation=str, required=True), 'storage_raw_path': FieldInfo(annotation=Union[str, NoneType], required=False), 'storage_region': FieldInfo(annotation=Union[str, NoneType], required=False), 'storage_type': FieldInfo(annotation=DataStorageType, required=True), 'template_type': FieldInfo(annotation=DataTemplateType, required=True), 'version': FieldInfo(annotation=int, required=True)}

Metadata about the fields defined on the model, mapping of field names to [FieldInfo][pydantic.fields.FieldInfo].

This replaces Model.__fields__ from Pydantic V1.

name: str
project_type: DataProjectType
release_dataset(**kwargs: Any) bool[source]

make a dataset released

Returns:

bool: Whether releasing succeeded

save(data: str, is_annotated: bool = False, does_release: bool = False, sup_storage_id: str = '', sup_storage_path: str = '', sup_storage_region: str = '', **kwargs: Any) bool[source]

Write data to qianfan. Currently only writing to user BOS storage is supported.

Args:

data (str): data waiting to be uploaded. is_annotated (bool): whether the data has been annotated, default to False. does_release (bool):

whether to release the dataset after saving successfully, default to False

sup_storage_id (Optional[str]):

bos bucket name used for uploading, we recommend to use this parameter when your destination dataset on qianfan is stored in public BOS. Default to empty str

sup_storage_path (Optional[str]):

bos bucket file path used for uploading, we recommend to use this parameter when your destination dataset on qianfan is stored in public BOS. Default to empty str

sup_storage_region (Optional[str]):

bos bucket region used for uploading, we recommend to use this parameter when your destination dataset on qianfan is stored in public BOS. Default to empty str

**kwargs (Any): optional arguments.

Returns:

bool: has data been uploaded successfully

set_format_type(format_type: FormatType) None[source]

Set format type binding to qianfan data source. Not available.

TextOnly -> Jsonl; MultiModel -> Json

set_type: DataSetType
sk: Optional[str]
storage_id: str
storage_name: str
storage_path: str
storage_raw_path: Optional[str]
storage_region: Optional[str]
storage_type: DataStorageType
template_type: DataTemplateType
version: int
class qianfan.dataset.Table(inner_table: Table)[source]

Bases: Addable, Listable, Processable

in-memory dataset representation inherited from pyarrow.Table, implementing the interfaces in process_interface.py

append(elem: Any, add_new_group: bool = False, is_grouped: bool = True) Self[source]

append an element to pyarrow table

Args:

elem (Union[List[Dict], Tuple[Dict], Dict]): Elements added to pyarrow table. add_new_group (bool):

Whether elem has a new group id. Only used when table is grouped.

is_grouped (bool):

Whether the elements in elem are in the same group. Only used when table is grouped, elem is a Sequence, and add_new_group is set to True. Defaults to True, in which case all elements will be in the same group. If it’s False, each element will have a sequentially incremented group id starting from the last available group id.

Returns:

Self: Table itself

col_append(elem: Any) Self[source]

append a column to pyarrow table

Args:
elem (Dict[str, List]): dict containing element added to pyarrow table

must have column name “name” and column data list “data”

Returns:

Self: Table itself

col_delete(index: Union[int, str]) Self[source]

delete a column from pyarrow table

Args:

index (str): column name to delete

Returns:

Self: Table itself

col_filter(op: Callable[[Any], bool]) Self[source]

filter on pyarrow table’s column

Args:

op (Callable[[Any], bool]): handler used to filter

Returns:

Self: Table itself

col_insert(elem: Any, index: Any) Self[source]

insert a column into pyarrow table

Args:
elem (Dict[str, List]): dict containing element added to pyarrow table

must have column name “name” and column data list “data”

index (int): where to insert new column

Returns:

Self: Table itself

col_list(by: Optional[Union[slice, int, str, Sequence[int], Sequence[str]]] = None) Any[source]

get column(s) from pyarrow table

Args:
by (Optional[Union[int, str, Sequence[int], Sequence[str]]]):

index or indices for columns, default to None, in which case return a python list of pyarrow table column

Returns:

Any: pyarrow table column list

col_map(op: Callable[[Any], Any]) Self[source]

map on pyarrow table’s column

Args:

op (Callable[[Any], Any]): handler used to map

Returns:

Self: Table itself

col_names() List[str][source]

get column name list

Returns:

List[str]: column name list

col_renames(new_names: List[str]) Self[source]

rename all dataset columns

Args:

new_names (List[str]): All new names for columns

Returns:

Self: A brand-new Table with new name

column_number() int[source]

get pyarrow table column count.

Returns:

int: column count.

delete(index: Union[int, str]) Self[source]

delete an element from pyarrow table

Args:

index (Union[int, str]): element index to delete

Returns:

Self: Table itself

filter(op: Callable[[Any], bool]) Self[source]

filter on pyarrow table’s row

Args:

op (Callable[[Any], bool]): handler used to filter

Returns:

Self: Table itself

insert(elem: Any, index: Any, group_id: int = -1, add_new_group: bool = False, is_grouped: bool = True) Self[source]

insert an element to pyarrow table

Args:

elem (Union[List[Dict], Tuple[Dict], Dict]): Elements added to pyarrow table. index (int): where to insert element(s). group_id (int):

which group id you want to apply to new element(s). Default to -1, which means let group id be automatically inferred from table.

add_new_group (bool):

Whether elem has a new group id. Only used when table is grouped and group_id is -1

is_grouped (bool):

Whether the elements in elem are in the same group. Only used when table is grouped, elem is a Sequence, and add_new_group is set to True. Defaults to True, in which case all elements will be in the same group. If it’s False, each element will have a sequentially incremented group id starting from the last available group id.

Returns:

Self: Table itself

is_dataset_grouped() bool[source]
is_dataset_packed() bool[source]
list(by: Optional[Union[slice, int, str, Sequence[int], Sequence[str]]] = None) Any[source]

get element(s) from pyarrow table

Args:
by (Optional[Union[slice, int, Sequence[int]]]):

index or indices for elements, default to None, in which case return a python list of pyarrow table row

Returns:

Any: pyarrow table row list

map(op: Callable[[Any], Any]) Self[source]

map on pyarrow table’s row

Args:

op (Callable[[Any], Any]): handler used to map

Returns:

Self: Table itself

pack() bool[source]

pack all group into 1 row and make table array-like with single column

Returns:

bool: whether packing succeeded

row_number() int[source]

get pyarrow table row count.

Returns:

int: row count.

to_pydict() Dict[source]

convert a pyarrow table to dict

Returns:

Dict: a dict

to_pylist() List[source]

convert a pyarrow table to list

Returns:

List: a list

unpack() bool[source]

unpack all elements in the column “_pack”. Make sure the elements in the column “_pack” are Sequence[Dict[str, Any]]

Returns:

bool: whether unpacking succeeded

Submodules

qianfan.dataset.consts module

constants for dataset using

qianfan.dataset.data_operator module

data operator for qianfan online. Not available currently.

class qianfan.dataset.data_operator.DeduplicationSimhash(*, operator_name: str = 'deduplication_simhash', operator_type: str = 'deduplication', distance: float)[source]

Bases: Deduplicator

Deduplicator class to deduplicate by simhash

distance: float
model_config: ClassVar[ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_fields: ClassVar[dict[str, FieldInfo]] = {'distance': FieldInfo(annotation=float, required=True), 'operator_name': FieldInfo(annotation=str, required=False, default='deduplication_simhash'), 'operator_type': FieldInfo(annotation=str, required=False, default='deduplication')}

Metadata about the fields defined on the model, mapping of field names to [FieldInfo][pydantic.fields.FieldInfo].

This replaces Model.__fields__ from Pydantic V1.

operator_name: str
class qianfan.dataset.data_operator.Deduplicator(*, operator_name: str, operator_type: str = 'deduplication')[source]

Bases: QianfanOperator

Deduplicator class for online ETL operator

model_config: ClassVar[ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_fields: ClassVar[dict[str, FieldInfo]] = {'operator_name': FieldInfo(annotation=str, required=True), 'operator_type': FieldInfo(annotation=str, required=False, default='deduplication')}

Metadata about the fields defined on the model, mapping of field names to [FieldInfo][pydantic.fields.FieldInfo].

This replaces Model.__fields__ from Pydantic V1.

operator_type: str
class qianfan.dataset.data_operator.DesensitizationProcessor(*, operator_name: str, operator_type: str = 'desensitization')[source]

Bases: QianfanOperator

Sensitive data processor class for online ETL operator

model_config: ClassVar[ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_fields: ClassVar[dict[str, FieldInfo]] = {'operator_name': FieldInfo(annotation=str, required=True), 'operator_type': FieldInfo(annotation=str, required=False, default='desensitization')}

Metadata about the fields defined on the model, mapping of field names to [FieldInfo][pydantic.fields.FieldInfo].

This replaces Model.__fields__ from Pydantic V1.

operator_type: str
class qianfan.dataset.data_operator.ExceptionRegulator(*, operator_name: str, operator_type: str = 'clean')[source]

Bases: QianfanOperator

Exception class for online ETL operator

model_config: ClassVar[ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_fields: ClassVar[dict[str, FieldInfo]] = {'operator_name': FieldInfo(annotation=str, required=True), 'operator_type': FieldInfo(annotation=str, required=False, default='clean')}

Metadata about the fields defined on the model, mapping of field names to [FieldInfo][pydantic.fields.FieldInfo].

This replaces Model.__fields__ from Pydantic V1.

operator_type: str
class qianfan.dataset.data_operator.Filter(*, operator_name: str, operator_type: str = 'filter')[source]

Bases: QianfanOperator

Filter class for online ETL operator

model_config: ClassVar[ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_fields: ClassVar[dict[str, FieldInfo]] = {'operator_name': FieldInfo(annotation=str, required=True), 'operator_type': FieldInfo(annotation=str, required=False, default='filter')}

Metadata about the fields defined on the model, mapping of field names to [FieldInfo][pydantic.fields.FieldInfo].

This replaces Model.__fields__ from Pydantic V1.

operator_type: str
class qianfan.dataset.data_operator.FilterCheckCharacterRepetitionRemoval(*, operator_name: str = 'filter_check_character_repetition_removal', operator_type: str = 'filter', character_repetition_max_cutoff: float)[source]

Bases: Filter

Filter class to check character repetition removal

character_repetition_max_cutoff: float
model_config: ClassVar[ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_fields: ClassVar[dict[str, FieldInfo]] = {'character_repetition_max_cutoff': FieldInfo(annotation=float, required=True), 'operator_name': FieldInfo(annotation=str, required=False, default='filter_check_character_repetition_removal'), 'operator_type': FieldInfo(annotation=str, required=False, default='filter')}

Metadata about the fields defined on the model, mapping of field names to [FieldInfo][pydantic.fields.FieldInfo].

This replaces Model.__fields__ from Pydantic V1.

operator_name: str
class qianfan.dataset.data_operator.FilterCheckFlaggedWords(*, operator_name: str = 'filter_check_flagged_words', operator_type: str = 'filter', flagged_words_max_cutoff: float)[source]

Bases: Filter

Filter class to check flagged words

flagged_words_max_cutoff: float
model_config: ClassVar[ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_fields: ClassVar[dict[str, FieldInfo]] = {'flagged_words_max_cutoff': FieldInfo(annotation=float, required=True), 'operator_name': FieldInfo(annotation=str, required=False, default='filter_check_flagged_words'), 'operator_type': FieldInfo(annotation=str, required=False, default='filter')}

Metadata about the fields defined on the model, mapping of field names to [FieldInfo][pydantic.fields.FieldInfo].

This replaces Model.__fields__ from Pydantic V1.

operator_name: str
class qianfan.dataset.data_operator.FilterCheckLangId(*, operator_name: str = 'filter_check_lang_id', operator_type: str = 'filter', lang_id_min_cutoff: float)[source]

Bases: Filter

Filter class to check lang id

lang_id_min_cutoff: float
model_config: ClassVar[ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_fields: ClassVar[dict[str, FieldInfo]] = {'lang_id_min_cutoff': FieldInfo(annotation=float, required=True), 'operator_name': FieldInfo(annotation=str, required=False, default='filter_check_lang_id'), 'operator_type': FieldInfo(annotation=str, required=False, default='filter')}

Metadata about the fields defined on the model, mapping of field names to [FieldInfo][pydantic.fields.FieldInfo].

This replaces Model.__fields__ from Pydantic V1.

operator_name: str
class qianfan.dataset.data_operator.FilterCheckNumberWords(*, operator_name: str = 'filter_check_number_words', operator_type: str = 'filter', number_words_min_cutoff: int = 1, number_words_max_cutoff: int = 10000)[source]

Bases: Filter

Filter class to check number of words

model_config: ClassVar[ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_fields: ClassVar[dict[str, FieldInfo]] = {'number_words_max_cutoff': FieldInfo(annotation=int, required=False, default=10000), 'number_words_min_cutoff': FieldInfo(annotation=int, required=False, default=1), 'operator_name': FieldInfo(annotation=str, required=False, default='filter_check_number_words'), 'operator_type': FieldInfo(annotation=str, required=False, default='filter')}

Metadata about the fields defined on the model, mapping of field names to [FieldInfo][pydantic.fields.FieldInfo].

This replaces Model.__fields__ from Pydantic V1.

number_words_max_cutoff: int
number_words_min_cutoff: int
operator_name: str
class qianfan.dataset.data_operator.FilterCheckPerplexity(*, operator_name: str = 'filter_check_perplexity', operator_type: str = 'filter', perplexity_max_cutoff: int)[source]

Bases: Filter

Filter class to check perplexity

model_config: ClassVar[ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_fields: ClassVar[dict[str, FieldInfo]] = {'operator_name': FieldInfo(annotation=str, required=False, default='filter_check_perplexity'), 'operator_type': FieldInfo(annotation=str, required=False, default='filter'), 'perplexity_max_cutoff': FieldInfo(annotation=int, required=True)}

Metadata about the fields defined on the model, mapping of field names to [FieldInfo][pydantic.fields.FieldInfo].

This replaces Model.__fields__ from Pydantic V1.

operator_name: str
perplexity_max_cutoff: int
class qianfan.dataset.data_operator.FilterCheckSpecialCharacters(*, operator_name: str = 'filter_check_special_characters', operator_type: str = 'filter', special_characters_max_cutoff: float)[source]

Bases: Filter

Filter class to check special characters

model_config: ClassVar[ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_fields: ClassVar[dict[str, FieldInfo]] = {'operator_name': FieldInfo(annotation=str, required=False, default='filter_check_special_characters'), 'operator_type': FieldInfo(annotation=str, required=False, default='filter'), 'special_characters_max_cutoff': FieldInfo(annotation=float, required=True)}

Metadata about the fields defined on the model, mapping of field names to [FieldInfo][pydantic.fields.FieldInfo].

This replaces Model.__fields__ from Pydantic V1.

operator_name: str
special_characters_max_cutoff: float
class qianfan.dataset.data_operator.FilterCheckWordRepetitionRemoval(*, operator_name: str = 'filter_check_word_repetition_removal', operator_type: str = 'filter', word_repetition_max_cutoff: float)[source]

Bases: Filter

Filter class to check word repetition removal

model_config: ClassVar[ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_fields: ClassVar[dict[str, FieldInfo]] = {'operator_name': FieldInfo(annotation=str, required=False, default='filter_check_word_repetition_removal'), 'operator_type': FieldInfo(annotation=str, required=False, default='filter'), 'word_repetition_max_cutoff': FieldInfo(annotation=float, required=True)}

Metadata about the fields defined on the model, mapping of field names to [FieldInfo][pydantic.fields.FieldInfo].

This replaces Model.__fields__ from Pydantic V1.

operator_name: str
word_repetition_max_cutoff: float
class qianfan.dataset.data_operator.QianfanOperator(*, operator_name: str, operator_type: str)[source]

Bases: BaseModel

Basic class for online ETL operator

model_config: ClassVar[ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_fields: ClassVar[dict[str, FieldInfo]] = {'operator_name': FieldInfo(annotation=str, required=True), 'operator_type': FieldInfo(annotation=str, required=True)}

Metadata about the fields defined on the model, mapping of field names to [FieldInfo][pydantic.fields.FieldInfo].

This replaces Model.__fields__ from Pydantic V1.

operator_name: str
operator_type: str
class qianfan.dataset.data_operator.RemoveEmoji(*, operator_name: str = 'remove_emoji', operator_type: str = 'clean')[source]

Bases: ExceptionRegulator

Exception class to remove emoji

model_config: ClassVar[ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_fields: ClassVar[dict[str, FieldInfo]] = {'operator_name': FieldInfo(annotation=str, required=False, default='remove_emoji'), 'operator_type': FieldInfo(annotation=str, required=False, default='clean')}

Metadata about the fields defined on the model, mapping of field names to [FieldInfo][pydantic.fields.FieldInfo].

This replaces Model.__fields__ from Pydantic V1.

operator_name: str
class qianfan.dataset.data_operator.RemoveInvisibleCharacter(*, operator_name: str = 'remove_invisible_character', operator_type: str = 'clean')[source]

Bases: ExceptionRegulator

Exception class to remove invisible character

model_config: ClassVar[ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_fields: ClassVar[dict[str, FieldInfo]] = {'operator_name': FieldInfo(annotation=str, required=False, default='remove_invisible_character'), 'operator_type': FieldInfo(annotation=str, required=False, default='clean')}

Metadata about the fields defined on the model, mapping of field names to [FieldInfo][pydantic.fields.FieldInfo].

This replaces Model.__fields__ from Pydantic V1.

operator_name: str
class qianfan.dataset.data_operator.RemoveNonMeaningCharacters(*, operator_name: str = 'remove_non_meaning_characters', operator_type: str = 'clean')[source]

Bases: ExceptionRegulator

Exception class to remove non-meaning characters

model_config: ClassVar[ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_fields: ClassVar[dict[str, FieldInfo]] = {'operator_name': FieldInfo(annotation=str, required=False, default='remove_non_meaning_characters'), 'operator_type': FieldInfo(annotation=str, required=False, default='clean')}

Metadata about the fields defined on the model, mapping of field names to [FieldInfo][pydantic.fields.FieldInfo].

This replaces Model.__fields__ from Pydantic V1.

operator_name: str
class qianfan.dataset.data_operator.RemoveWebIdentifiers(*, operator_name: str = 'remove_web_identifiers', operator_type: str = 'clean')[source]

Bases: ExceptionRegulator

Exception class to remove web identifiers

model_config: ClassVar[ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_fields: ClassVar[dict[str, FieldInfo]] = {'operator_name': FieldInfo(annotation=str, required=False, default='remove_web_identifiers'), 'operator_type': FieldInfo(annotation=str, required=False, default='clean')}

Metadata about the fields defined on the model, mapping of field names to [FieldInfo][pydantic.fields.FieldInfo].

This replaces Model.__fields__ from Pydantic V1.

operator_name: str
class qianfan.dataset.data_operator.ReplaceEmails(*, operator_name: str = 'replace_emails', operator_type: str = 'desensitization')[source]

Bases: DesensitizationProcessor

Sensitive data processor class to replace emails

model_config: ClassVar[ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_fields: ClassVar[dict[str, FieldInfo]] = {'operator_name': FieldInfo(annotation=str, required=False, default='replace_emails'), 'operator_type': FieldInfo(annotation=str, required=False, default='desensitization')}

Metadata about the fields defined on the model, mapping of field names to [FieldInfo][pydantic.fields.FieldInfo].

This replaces Model.__fields__ from Pydantic V1.

operator_name: str
class qianfan.dataset.data_operator.ReplaceIdentifier(*, operator_name: str = 'replace_identifier', operator_type: str = 'desensitization')[source]

Bases: DesensitizationProcessor

Sensitive data processor class to replace identifier

model_config: ClassVar[ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_fields: ClassVar[dict[str, FieldInfo]] = {'operator_name': FieldInfo(annotation=str, required=False, default='replace_identifier'), 'operator_type': FieldInfo(annotation=str, required=False, default='desensitization')}

Metadata about the fields defined on the model, mapping of field names to [FieldInfo][pydantic.fields.FieldInfo].

This replaces Model.__fields__ from Pydantic V1.

operator_name: str
class qianfan.dataset.data_operator.ReplaceIp(*, operator_name: str = 'replace_ip', operator_type: str = 'desensitization')[source]

Bases: DesensitizationProcessor

Sensitive data processor class to replace ip

model_config: ClassVar[ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_fields: ClassVar[dict[str, FieldInfo]] = {'operator_name': FieldInfo(annotation=str, required=False, default='replace_ip'), 'operator_type': FieldInfo(annotation=str, required=False, default='desensitization')}

Metadata about the fields defined on the model, mapping of field names to [FieldInfo][pydantic.fields.FieldInfo].

This replaces Model.__fields__ from Pydantic V1.

operator_name: str
class qianfan.dataset.data_operator.ReplaceTraditionalChineseToSimplified(*, operator_name: str = 'replace_traditional_chinese_to_simplified', operator_type: str = 'clean')[source]

Bases: ExceptionRegulator

Exception class to replace traditional chinese to simplified

model_config: ClassVar[ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_fields: ClassVar[dict[str, FieldInfo]] = {'operator_name': FieldInfo(annotation=str, required=False, default='replace_traditional_chinese_to_simplified'), 'operator_type': FieldInfo(annotation=str, required=False, default='clean')}

Metadata about the fields defined on the model, mapping of field names to [FieldInfo][pydantic.fields.FieldInfo].

This replaces Model.__fields__ from Pydantic V1.

operator_name: str
class qianfan.dataset.data_operator.ReplaceUniformWhitespace(*, operator_name: str = 'replace_uniform_whitespace', operator_type: str = 'clean')[source]

Bases: ExceptionRegulator

Exception class to replace uniform whitespace

model_config: ClassVar[ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_fields: ClassVar[dict[str, FieldInfo]] = {'operator_name': FieldInfo(annotation=str, required=False, default='replace_uniform_whitespace'), 'operator_type': FieldInfo(annotation=str, required=False, default='clean')}

Metadata about the fields defined on the model, mapping of field names to [FieldInfo][pydantic.fields.FieldInfo].

This replaces Model.__fields__ from Pydantic V1.

operator_name: str

qianfan.dataset.data_source module

data source which is related to download/upload

class qianfan.dataset.data_source.DataSource[source]

Bases: ABC

basic data source class

abstract async afetch(**kwargs: Any) str[source]

Asynchronously fetch data from source

Args:

**kwargs (Any): optional arguments

Returns:

str: content retrieved from data source

abstract async asave(data: str, **kwargs: Any) bool[source]

Asynchronously export the data to the data source and return whether the import was successful or failed

Args:

data (str): data need to be saved **kwargs (Any): optional arguments

Returns:

bool: is saving successful

abstract fetch(**kwargs: Any) str[source]

Fetch data from source

Args:

**kwargs (Any): optional arguments

Returns:

str: content retrieved from data source

abstract format_type() FormatType[source]

Get format type binding to source

Returns:

FormatType: format type binding to source

abstract save(data: str, **kwargs: Any) bool[source]

Export the data to the data source and return whether the import was successful or failed

Args:

data (str): data need to be saved **kwargs (Any): optional arguments

Returns:

bool: is saving successful

abstract set_format_type(format_type: FormatType) None[source]

Set format type binding to source

Args:

format_type (FormatType): format type binding to source

class qianfan.dataset.data_source.FileDataSource(*, path: str, file_format: Optional[FormatType] = None)[source]

Bases: DataSource, BaseModel

file data source

async afetch(**kwargs: Any) str[source]

Asynchronously Read data from file. Not available currently

Args:

**kwargs (Any): Arbitrary keyword arguments.

Returns:

str: A string containing the data read from the file.

async asave(data: str, **kwargs: Any) bool[source]

Asynchronously write data to file. Not available currently.

Args:

data (str): data waiting to be written. **kwargs (Any): optional arguments.

Returns:

bool: has data been written successfully

fetch(**kwargs: Any) str[source]

Read data from file.

Args:

**kwargs (Any): Arbitrary keyword arguments.

Returns:

str: A string containing the data read from the file.

file_format: Optional[FormatType]
format_type() FormatType[source]

Get format type binding to source

Returns:

FormatType: format type binding to source

model_config: ClassVar[ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_fields: ClassVar[dict[str, FieldInfo]] = {'file_format': FieldInfo(annotation=Union[FormatType, NoneType], required=False), 'path': FieldInfo(annotation=str, required=True)}

Metadata about the fields defined on the model, mapping of field names to [FieldInfo][pydantic.fields.FieldInfo].

This replaces Model.__fields__ from Pydantic V1.

path: str
save(data: str, **kwargs: Any) bool[source]

Write data to file.

Args:

data (str): data waiting to be written. **kwargs (Any): optional arguments.

Returns:

bool: has data been written successfully

set_format_type(format_type: FormatType) None[source]

Set format type binding to source

Args:

format_type (FormatType): format type binding to source

class qianfan.dataset.data_source.FormatType(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)[source]

Bases: Enum

Enum for data source format type

Csv = 'csv'
Json = 'json'
Jsonl = 'jsonl'
Text = 'txt'
class qianfan.dataset.data_source.QianfanDataSource(*, id: int, group_id: int, name: str, set_type: DataSetType, project_type: DataProjectType, template_type: DataTemplateType, version: int, storage_type: DataStorageType, storage_id: str, storage_path: str, storage_raw_path: Optional[str] = None, storage_name: str, storage_region: Optional[str] = None, info: Dict[str, Any] = {}, download_when_init: bool = False, data_format_type: FormatType, ak: Optional[str] = None, sk: Optional[str] = None)[source]

Bases: DataSource, BaseModel

Qianfan data source

async afetch(**kwargs: Any) str[source]

Asynchronously read data from qianfan or local cache. Not available currently.

Args:

**kwargs (Any): Arbitrary keyword arguments.

Returns:

str: A string containing the data.

ak: Optional[str]
async asave(data: str, is_annotated: bool = False, **kwargs: Any) bool[source]

Asynchronously write data to qianfan. Currently only writing to user BOS storage is supported.

Not available currently

Args:

data (str): data waiting to be uploaded. is_annotated (bool): has data been annotated. **kwargs (Any): optional arguments.

Returns:

bool: has data been uploaded successfully

classmethod create_bare_dataset(name: str, template_type: DataTemplateType, storage_type: DataStorageType = DataStorageType.PublicBos, storage_id: Optional[str] = None, storage_path: Optional[str] = None, addition_info: Optional[Dict[str, Any]] = None, ak: Optional[str] = None, sk: Optional[str] = None, **kwargs: Any) QianfanDataSource[source]

Create a bare (empty) dataset on qianfan as data source. Args:

name (str): dataset name you want template_type (DataTemplateType): template type applying to data set storage_type (Optional[DataStorageType]):

data storage type used to store your data, default to PublicBos

storage_id (Optional[str]): private BOS bucket name,

needed when storage_type is PrivateBos, default to None

storage_path (Optional[str]): private BOS file path,

needed when storage_type is PrivateBos, default to None

addition_info (Optional[Dict[str, Any]]):

additional info you want to have, default to None

ak (Optional[str]):

console ak related to your dataset and bos, default to None

sk (Optional[str]):

console sk related to your dataset and bos, default to None

kwargs (Any): other arguments

Returns:

QianfanDataSource: A datasource represents your dataset on Qianfan

classmethod create_from_bos_file(name: str, template_type: DataTemplateType, storage_id: str, storage_path: str, file_name: str, is_data_annotated: bool, storage_type: DataStorageType = DataStorageType.PrivateBos, addition_info: Optional[Dict[str, Any]] = None, ak: Optional[str] = None, sk: Optional[str] = None, is_download_to_local: bool = True, **kwargs: Any) QianfanDataSource[source]

Create a dataset on qianfan as data source, importing its data from the specified BOS location. Args:

name (str): dataset name you want template_type (DataTemplateType): template type applying to data set storage_id (str): private BOS bucket name storage_path (str): private BOS file path file_name (str): file need to upload is_data_annotated (bool): is data in bos annotated storage_type (Optional[DataStorageType]):

data storage type used to store your data, default to PrivateBos

addition_info (Optional[Dict[str, Any]]):

additional info you want to have, default to None

ak (Optional[str]):

console ak related to your dataset and bos, default to None

sk (Optional[str]):

console sk related to your dataset and bos, default to None

is_download_to_local (bool):

whether the dataset file is downloaded when the object is initialized, default to True

kwargs (Any): other arguments

Returns:

QianfanDataSource: A datasource represents your dataset on Qianfan

data_format_type: FormatType
download_when_init: bool
fetch(**kwargs: Any) str[source]

Read data from qianfan or local cache.

Args:

**kwargs (Any): Arbitrary keyword arguments.

Returns:

str: A string containing the data.

format_type() FormatType[source]

Get format type binding to qianfan data source

Returns:

FormatType: format type binding to qianfan data source

classmethod get_existed_dataset(dataset_id: int, is_download_to_local: bool = True, ak: Optional[str] = None, sk: Optional[str] = None, **kwargs: Any) QianfanDataSource[source]

Load a dataset from qianfan as data source

Args:

dataset_id (int): dataset id on Qianfan, show as “数据集版本 ID” is_download_to_local (bool):

whether the dataset file is downloaded when the object is initialized, default to True

ak (Optional[str]):

console ak related to your dataset and bos, default to None

sk (Optional[str]):

console sk related to your dataset and bos, default to None

kwargs (Any): other arguments

Returns:

QianfanDataSource: A datasource represents your dataset on Qianfan

group_id: int
id: int
info: Dict[str, Any]
model_config: ClassVar[ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_fields: ClassVar[dict[str, FieldInfo]] = {'ak': FieldInfo(annotation=Union[str, NoneType], required=False), 'data_format_type': FieldInfo(annotation=FormatType, required=True), 'download_when_init': FieldInfo(annotation=bool, required=False, default=False), 'group_id': FieldInfo(annotation=int, required=True), 'id': FieldInfo(annotation=int, required=True), 'info': FieldInfo(annotation=Dict[str, Any], required=False, default={}), 'name': FieldInfo(annotation=str, required=True), 'project_type': FieldInfo(annotation=DataProjectType, required=True), 'set_type': FieldInfo(annotation=DataSetType, required=True), 'sk': FieldInfo(annotation=Union[str, NoneType], required=False), 'storage_id': FieldInfo(annotation=str, required=True), 'storage_name': FieldInfo(annotation=str, required=True), 'storage_path': FieldInfo(annotation=str, required=True), 'storage_raw_path': FieldInfo(annotation=Union[str, NoneType], required=False), 'storage_region': FieldInfo(annotation=Union[str, NoneType], required=False), 'storage_type': FieldInfo(annotation=DataStorageType, required=True), 'template_type': FieldInfo(annotation=DataTemplateType, required=True), 'version': FieldInfo(annotation=int, required=True)}

Metadata about the fields defined on the model, mapping of field names to [FieldInfo][pydantic.fields.FieldInfo].

This replaces Model.__fields__ from Pydantic V1.

name: str
project_type: DataProjectType
release_dataset(**kwargs: Any) bool[source]

make a dataset released

Returns:

bool: Whether releasing succeeded

save(data: str, is_annotated: bool = False, does_release: bool = False, sup_storage_id: str = '', sup_storage_path: str = '', sup_storage_region: str = '', **kwargs: Any) bool[source]

Write data to qianfan. Currently only writing to user BOS storage is supported.

Args:

data (str): data waiting to be uploaded. is_annotated (bool): has data been annotated, default to False does_release (bool):

does release dataset after saving successfully, default to False

sup_storage_id (Optional[str]):

bos bucket name used for uploading, we recommend to use this parameter when your destination dataset on qianfan is stored in public BOS. Default to empty str

sup_storage_path (Optional[str]):

bos bucket file path used for uploading, we recommend to use this parameter when your destination dataset on qianfan is stored in public BOS. Default to empty str

sup_storage_region (Optional[str]):

bos bucket region used for uploading, we recommend to use this parameter when your destination dataset on qianfan is stored in public BOS. Default to empty str

**kwargs (Any): optional arguments.

Returns:

bool: has data been uploaded successfully

set_format_type(format_type: FormatType) None[source]

Set format type binding to qianfan data source. Not available.

TextOnly -> Jsonl MultiModel -> Json

set_type: DataSetType
sk: Optional[str]
storage_id: str
storage_name: str
storage_path: str
storage_raw_path: Optional[str]
storage_region: Optional[str]
storage_type: DataStorageType
template_type: DataTemplateType
version: int

qianfan.dataset.dataset module

dataset core concept, a wrap of data processing, data transmission and data validation

class qianfan.dataset.dataset.Dataset(inner_table: Table, inner_data_source_cache: Optional[DataSource] = None, inner_schema_cache: Optional[Schema] = None)[source]

Bases: Table

add_default_group_column() Self[source]

add “_group” column to Dataset, the value in “_group” column are sequential incremental

Returns:

Self: Dataset itself

append(elem: Any, add_new_group: bool = False, is_grouped: bool = True) Self[source]

append element(s) to dataset

Args:
elem (Union[List[List[Dict]], List[Dict], Tuple[Dict], Dict]):

Elements added to dataset

add_new_group (bool):

Whether elem has a new group id. Only used when dataset is grouped.

is_grouped (bool):

Whether the elements in elem are in the same group. Only used when dataset is grouped and elem is Sequence and add_new_group was set True. Default to True, in which case all elements will be in the same group. If it’s False, each element will have a sequential incremental group id starting from the last available group id.

Returns:

Self: Dataset itself

col_append(elem: Any) Self[source]

append a column to dataset

Args:
elem (Dict[str, List]): dict containing element added to dataset

must have column name “name” and column data list “data”

Returns:

Self: Dataset itself

col_delete(index: Union[int, str]) Self[source]

delete a column from dataset

Args:

index (str): column name to delete

Returns:

Self: Dataset itself

col_filter(op: Callable[[Any], bool]) Self[source]

filter on dataset’s column

Args:

op (Callable[[Any], bool]): handler used to filter

Returns:

Self: Dataset itself

col_insert(elem: Any, index: Any) Self[source]

insert a column into dataset

Args:
elem (Dict[str, List]): dict containing element added to dataset

must have column name “name” and column data list “data”

index (int): where to insert new column

Returns:

Self: Dataset itself

col_list(by: Optional[Union[slice, int, str, List[int], Tuple[int], List[str], Tuple[str]]] = None) Any[source]

get column(s) from dataset

Args:
by (Optional[Union[int, str, Sequence[int], Sequence[str]]]):

index or indices for columns, default to None, in which case return a python list of dataset column

Returns:

Any: dataset column list

col_map(op: Callable[[Any], Any]) Self[source]

map on dataset’s column

Args:

op (Callable[[Any], Any]): handler used to map

Returns:

Self: Dataset itself

col_names() List[str][source]

get column name list

Returns:

List[str]: column name list

col_renames(new_names: List[str]) Self[source]

rename all dataset column

Args:

new_names (List[str]): All new names for columns

Returns:

Self: A brand-new Dataset with new name

classmethod create_from_pyarrow_table(table: Table, schema: Optional[Schema] = None) Dataset[source]

create a dataset from pyarrow table

Args:

table (pyarrow): pyarrow table object used to create dataset. schema (Optional[Schema]):

schema used to validate before exporting data, default to None

Returns:

Dataset: a dataset instance

classmethod create_from_pyobj(data: Union[List[Dict[str, Any]], Dict[str, List]], schema: Optional[Schema] = None) Dataset[source]

create a dataset from python dict or list

Args:
data (Union[List[Dict[str, Any]], Dict[str, List]]):

python object used to create dataset.

schema (Optional[Schema]):

schema used to validate before exporting data, default to None

Returns:

Dataset: a dataset instance

delete(index: Union[int, str]) Self[source]

delete an element from dataset

Args:

index (Union[int, str]): element index to delete

Returns:

Self: Dataset itself

delete_group_column() Self[source]

remove “_group” column from Dataset

Returns:

Self: Dataset itself

filter(op: Callable[[Any], bool]) Self[source]

filter on dataset

Args:

op (Callable[[Any], bool]): handler used to filter

Returns:

Self: Dataset itself

insert(elem: Any, index: Any, group_id: int = -1, add_new_group: bool = False, is_grouped: bool = True) Self[source]

insert element(s) to dataset

Args:
elem (Union[List[List[Dict]], List[Dict], Tuple[Dict], Dict]):

Elements added to dataset

index (int): where to insert element(s) group_id (int):

which group id you want to apply to new element(s). Default to -1, which means let group id be automatically inferred from table.

add_new_group (bool):

Whether elem has a new group id. Only used when dataset is grouped and group_id is -1

is_grouped (bool):

Whether the elements in elem are in the same group. Only used when dataset is grouped and elem is Sequence and add_new_group was set True. Default to True, in which case all elements will be in the same group. If it’s False, each element will have a sequential incremental group id starting from the last available group id.

Returns:

Self: Dataset itself

is_dataset_located_in_qianfan() bool[source]

tell whether current is cloud dataset

Returns:

bool: whether current is cloud dataset

list(by: Optional[Union[slice, int, str, Sequence[int], Sequence[str]]] = None, **kwargs: Any) Any[source]

get element(s) from dataset

Args:
by (Optional[Union[slice, int, Sequence[int]]]):

index or indices for elements, default to None, in which case return a python list of dataset row

Returns:

Any: dataset row list

classmethod load(source: Optional[DataSource] = None, data_file: Optional[str] = None, qianfan_dataset_id: Optional[int] = None, bos_load_args: Optional[Dict[str, Any]] = None, huggingface_dataset: Optional[Any] = None, schema: Optional[Schema] = None, organize_data_as_group: bool = False, **kwargs: Any) Dataset[source]

Read data from the source or create a source from the parameters and create a Table instance. If a schema is specified, perform validation after importing.

Args:
source (Optional[DataSource]): where dataset load from,

default to None, in which case a datasource will be created inside dataset using parameters below

data_file (Optional[str]):

dataset local file path, default to None

qianfan_dataset_id (Optional[int]):

qianfan dataset ID, default to None

bos_load_args: (Optional[Dict[str, Any]]):

create a dataset and import initial dataset content from args

huggingface_dataset (Optional[Dict[str, Any], Any]):

Huggingface dataset object, only support DatasetDict and Dataset of Huggingface datasets.

schema (Optional[Schema]):

schema used to validate loaded data, default to None

organize_data_as_group (bool):

only available when data source’s format is FormatType.Jsonl. Indicates whether organize data within dataset in group format, default to False, and when it’s True, the default format will be a group-based 2D structure.

**kwargs (Any): optional arguments

Returns:

Dataset: a dataset instance

map(op: Callable[[Any], Any]) Self[source]

map on dataset

Args:

op (Callable[[Any], Any]): handler used to map

Returns:

Self: Dataset itself

online_data_process(operators: List[QianfanOperator]) Dict[str, Any][source]

Create an online ETL task on qianfan. Not available currently.

Args:

operators (List[QianfanOperator]): operators applied to ETL task

Returns:
Dict[str, Any]: ETL task info, contains 3 field:

is_succeeded (bool): whether ETL task succeed etl_task_id (Optional[int]): etl task id, only

exists when etl task is created successfully

new_dataset_id (Optional[int]): dataset id which

stores data after etl, only exists when etl task is succeeded

save(destination: Optional[DataSource] = None, data_file: Optional[str] = None, qianfan_dataset_id: Optional[int] = None, qianfan_dataset_create_args: Optional[Dict[str, Any]] = None, schema: Optional[Schema] = None, replace_source: bool = False, **kwargs: Any) bool[source]

Write data to source. If a schema has been passed, validate data before exporting.

Args:
destination (Optional[DataSource]):

data source the dataset is exported to, default to None, in which case a datasource will be created inside dataset using parameters below

data_file (Optional[str]):

dataset local file path, default to None

qianfan_dataset_id (Optional[int]):

qianfan dataset ID, default to None

qianfan_dataset_create_args: (Optional[Dict[str, Any]]):

create arguments for creating a bare dataset on qianfan, default to None

schema: (Optional[Schema]):

schema used to validate before exporting data, default to None

replace_source: (bool):

if replace the original source, default to False

kwargs (Any): optional arguments

Returns:

bool: is saving succeeded

qianfan.dataset.process_interface module

interface file

class qianfan.dataset.process_interface.Addable[source]

Bases: ABC

make object ‘addable’

abstract append(elem: Any) Self[source]

append an element to an Addable object

Args:

elem (Any): element to append

Returns:

Self: a new Addable object after appending

abstract insert(elem: Any, index: Any) Self[source]

insert an element into an Addable object

Args:

elem (Any): element(s) to insert index (Any): where to insert element(s)

Returns:

Self: a new Addable object after inserting

class qianfan.dataset.process_interface.Listable[source]

Bases: ABC

make object ‘listable’

abstract list(by: Optional[Union[slice, int, str, Sequence[int], Sequence[str]]] = None) Any[source]

get an element from object

Args:
by (Optional[Union[slice, int, str, Sequence[int], Sequence[str]]]):

index used to get data or data list, default to None

Returns:

Any: elements

class qianfan.dataset.process_interface.Processable[source]

Bases: ABC

make object ‘processable’

abstract delete(index: Union[int, str]) Self[source]

delete an element from Processable object

Args:

index (Union[int, str]): element index to delete

Returns:

Self: a new Processable object after delete

abstract filter(op: Callable[[Any], bool]) Self[source]

filter on a Processable object

Args:

op (Callable[[Any], bool]): handler used to filter

Returns:

Self: a new Processable object after filtering

abstract map(op: Callable[[Any], Any]) Self[source]

map on a Processable object

Args:

op (Callable[[Any], Any]): handler used to map

Returns:

Self: a new Processable object after mapping

qianfan.dataset.schema module

schema for validation; currently qianfan schema only

class qianfan.dataset.schema.QianfanGenericText[source]

Bases: QianfanSchema

validator for generic text dataset

validate(table: Table) bool[source]

validate a table

Args:

table (Table): table that needs to be validated

Returns:

bool: whether the table is valid

class qianfan.dataset.schema.QianfanNonSortedConversation[source]

Bases: QianfanSchema

validator for non-sorted, conversational dataset

validate(table: Table) bool[source]

validate a table

Args:

table (Table): table that needs to be validated

Returns:

bool: whether the table is valid

class qianfan.dataset.schema.QianfanQuerySet[source]

Bases: QianfanSchema

validator for query set dataset

validate(table: Table) bool[source]

validate a table

Args:

table (Table): table that needs to be validated

Returns:

bool: whether the table is valid

class qianfan.dataset.schema.QianfanSchema[source]

Bases: Schema

validate(table: Table) bool[source]

validate a dataset.Table object. Currently checks fields and types only, not the content in the table

Args:

table (Table): table that needs to be validated

Returns:

bool: whether the table is valid

class qianfan.dataset.schema.QianfanSortedConversation[source]

Bases: QianfanSchema

validator for sorted, conversational dataset

validate(table: Table) bool[source]

validate a table

Args:

table (Table): table that needs to be validated

Returns:

bool: whether the table is valid

class qianfan.dataset.schema.QianfanText2Image[source]

Bases: QianfanSchema

validator for text to image dataset

validate(table: Table) bool[source]

validate a table

Args:

table (Table): table that needs to be validated

Returns:

bool: whether the table is valid

class qianfan.dataset.schema.Schema[source]

Bases: ABC

abstract validate(table: Table) bool[source]

validate a dataset.Table object. Currently checks fields and types only, not the content in the table

Args:

table (Table): table that needs to be validated

Returns:

bool: whether the table is valid

qianfan.dataset.table module

wrapper for pyarrow.Table

class qianfan.dataset.table.Table(inner_table: Table)[source]

Bases: Addable, Listable, Processable

in-memory dataset representation wrapping pyarrow.Table, implementing the interfaces in process_interface.py

append(elem: Any, add_new_group: bool = False, is_grouped: bool = True) Self[source]

append an element to pyarrow table

Args:

elem (Union[List[Dict], Tuple[Dict], Dict]): Elements added to pyarrow table

add_new_group (bool):

Whether elem has a new group id. Only used when table is grouped.

is_grouped (bool):

Whether the elements in elem are in the same group. Only used when the table is grouped, elem is a Sequence and add_new_group is set to True. Default to True, in which case all elements will be in the same group. If it’s False, each element will have a sequential incremental group id starting from the last available group id.

Returns:

Self: Table itself

col_append(elem: Any) Self[source]

append a column to pyarrow table

Args:
elem (Dict[str, List]): dict containing element added to pyarrow table

must have column name “name” and column data list “data”

Returns:

Self: Table itself

col_delete(index: Union[int, str]) Self[source]

delete a column from pyarrow table

Args:

index (str): column name to delete

Returns:

Self: Table itself

col_filter(op: Callable[[Any], bool]) Self[source]

filter on pyarrow table’s column

Args:

op (Callable[[Any], bool]): handler used to filter

Returns:

Self: Table itself

col_insert(elem: Any, index: Any) Self[source]

insert a column into pyarrow table

Args:
elem (Dict[str, List]): dict containing element added to pyarrow table

must have column name “name” and column data list “data”

index (int): where to insert new column

Returns:

Self: Table itself

col_list(by: Optional[Union[slice, int, str, Sequence[int], Sequence[str]]] = None) Any[source]

get column(s) from pyarrow table

Args:
by (Optional[Union[int, str, Sequence[int], Sequence[str]]]):

index or indices for columns, default to None, in which case return a python list of pyarrow table column

Returns:

Any: pyarrow table column list

col_map(op: Callable[[Any], Any]) Self[source]

map on pyarrow table’s column

Args:

op (Callable[[Any], Any]): handler used to map

Returns:

Self: Table itself

col_names() List[str][source]

get column name list

Returns:

List[str]: column name list

col_renames(new_names: List[str]) Self[source]

rename all dataset columns

Args:

new_names (List[str]): All new names for columns

Returns:

Self: A brand-new Table with new name

column_number() int[source]

get pyarrow table column count.

Returns:

int: column count.

delete(index: Union[int, str]) Self[source]

delete an element from pyarrow table

Args:

index (Union[int, str]): element index to delete

Returns:

Self: Table itself

filter(op: Callable[[Any], bool]) Self[source]

filter on pyarrow table’s row

Args:

op (Callable[[Any], bool]): handler used to filter

Returns:

Self: Table itself

insert(elem: Any, index: Any, group_id: int = -1, add_new_group: bool = False, is_grouped: bool = True) Self[source]

insert an element to pyarrow table

Args:

elem (Union[List[Dict], Tuple[Dict], Dict]): Elements added to pyarrow table

index (int): where to insert element(s)

group_id (int):

which group id you want to apply to new element(s). Default to -1, which means let group id be automatically inferred from table.

add_new_group (bool):

Whether elem has a new group id. Only used when table is grouped and group_id is -1

is_grouped (bool):

Whether the elements in elem are in the same group. Only used when the table is grouped, elem is a Sequence and add_new_group is set to True. Default to True, in which case all elements will be in the same group. If it’s False, each element will have a sequential incremental group id starting from the last available group id.

Returns:

Self: Table itself

is_dataset_grouped() bool[source]
is_dataset_packed() bool[source]
list(by: Optional[Union[slice, int, str, Sequence[int], Sequence[str]]] = None) Any[source]

get element(s) from pyarrow table

Args:
by (Optional[Union[slice, int, Sequence[int]]]):

index or indices for elements, default to None, in which case return a python list of pyarrow table row

Returns:

Any: pyarrow table row list

map(op: Callable[[Any], Any]) Self[source]

map on pyarrow table’s row

Args:

op (Callable[[Any], Any]): handler used to map

Returns:

Self: Table itself

pack() bool[source]

pack all group into 1 row and make table array-like with single column

Returns:

bool: whether packing succeeded

row_number() int[source]

get pyarrow table row count.

Returns:

int: row count.

to_pydict() Dict[source]

convert a pyarrow table to dict

Returns:

Dict: a dict

to_pylist() List[source]

convert a pyarrow table to list

Returns:

List: a list

unpack() bool[source]

unpack all elements in the column “_pack”. Make sure each element in the column “_pack” is a Sequence[Dict[str, Any]]

Returns:

bool: whether unpacking succeeded

qianfan.dataset.utils module

utilities dataset needs