qianfan.dataset.data_source package

data source including file

class qianfan.dataset.data_source.BosDataSource(*, region: str, bucket: str, bos_file_path: str, file_format: Optional[FormatType] = None, ak: Optional[str] = None, sk: Optional[str] = None)[source]

Bases: DataSource, BaseModel

Bos Data Source

async afetch(**kwargs: Any) Union[str, List[str]][source]

Asynchronously Read data from bos. Not available currently

Args:

**kwargs (Any): Arbitrary keyword arguments.

Returns:
Union[str, List[str]]:

String or list of string containing the data read from the file.

ak: Optional[str]
async asave(data: str, **kwargs: Any) bool[source]

Asynchronously export the data to specific bos storage and return whether the export was successful or failed. Not available currently.

Args:

data (str): data to be saved

**kwargs (Any): optional arguments

Returns:

bool: is saving successful

bos_file_path: str
bucket: str
fetch(read_from_zip: bool = False, **kwargs: Any) Union[str, List[str]][source]

Read data from bos.

Args:
read_from_zip (bool):

whether to read data from a zip file, default to False

**kwargs (Any): Arbitrary keyword arguments.

Returns:
Union[str, List[str]]:

String or list of string containing the data read from the file.

file_format: Optional[FormatType]
format_type() FormatType[source]

Get format type binding to source

Returns:

FormatType: format type binding to source

region: str
save(data: Optional[str] = None, zip_file_path: Optional[str] = None, should_overwrite_existed_file: bool = False, **kwargs: Any) bool[source]

Export the data to specific bos storage and return whether the export was successful or failed.

Args:
data (Optional[str]):

data need to be saved, default to None

zip_file_path (Optional[str]):

path of your zip file, default to None

should_overwrite_existed_file (bool):

whether the bos data source should overwrite an existing file when saving data, default to False

**kwargs (Any):

optional arguments

Returns:

bool: is saving successful

set_format_type(format_type: FormatType) None[source]

Set format type binding to source

Args:

format_type (FormatType): format type binding to source

sk: Optional[str]
class qianfan.dataset.data_source.DataSource[source]

Bases: ABC

basic data source class

abstract async afetch(**kwargs: Any) Union[str, List[str]][source]

Asynchronously fetch data from source

Args:

**kwargs (Any): optional arguments

Returns:

Union[str, List[str]]: content retrieved from data source

abstract async asave(data: str, **kwargs: Any) bool[source]

Asynchronously export the data to the data source and return whether the export was successful or failed.

Args:

data (str): data to be saved

**kwargs (Any): optional arguments

Returns:

bool: is saving successful

abstract fetch(**kwargs: Any) Union[str, List[str]][source]

Fetch data from source

Args:

**kwargs (Any): optional arguments

Returns:

Union[str, List[str]]: content retrieved from data source

abstract format_type() FormatType[source]

Get format type binding to source

Returns:

FormatType: format type binding to source

abstract save(data: str, **kwargs: Any) bool[source]

Export the data to the data source and return whether the export was successful or failed.

Args:

data (str): data to be saved

**kwargs (Any): optional arguments

Returns:

bool: is saving successful

abstract set_format_type(format_type: FormatType) None[source]

Set format type binding to source

Args:

format_type (FormatType): format type binding to source

class qianfan.dataset.data_source.FileDataSource(*, path: str, file_format: Optional[FormatType] = None, save_as_folder: bool = False)[source]

Bases: DataSource, BaseModel

file data source

async afetch(**kwargs: Any) Union[str, List[str]][source]

Asynchronously Read data from file. Not available currently

Args:

**kwargs (Any): Arbitrary keyword arguments.

Returns:
Union[str, List[str]]:

String or list of string containing the data read from the file.

async asave(data: Union[str, List[str]], **kwargs: Any) bool[source]

Asynchronously write data to file. Not available currently.

Args:

data (Union[str, List[str]]): data waiting to be written.

**kwargs (Any): optional arguments.

Returns:

bool: has data been written successfully

fetch(**kwargs: Any) Union[str, List[str]][source]

Read data from file.

Args:

**kwargs (Any): Arbitrary keyword arguments.

Returns:
Union[str, List[str]]:

String or list of string containing the data read from the file.

file_format: Optional[FormatType]
format_type() FormatType[source]

Get format type binding to source

Returns:

FormatType: format type binding to source

path: str
save(data: Union[str, List[str]], **kwargs: Any) bool[source]

Write data to file.

Args:

data (Union[str, List[str]]): data waiting to be written.

**kwargs (Any): optional arguments.

Returns:

bool: has data been written successfully

save_as_folder: bool
set_format_type(format_type: FormatType) None[source]

Set format type binding to source

Args:

format_type (FormatType): format type binding to source

class qianfan.dataset.data_source.FormatType(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)[source]

Bases: Enum

Enum for data source format type

Csv = 'csv'
Json = 'json'
Jsonl = 'jsonl'
Text = 'txt'
class qianfan.dataset.data_source.QianfanDataSource(*, id: str, group_id: str, name: str, set_type: DataSetType, project_type: DataProjectType, template_type: DataTemplateType, version: int, storage_type: DataStorageType, storage_id: str, storage_path: str, storage_raw_path: Optional[str] = None, storage_name: str, storage_region: Optional[str] = None, info: Dict[str, Any] = {}, download_when_init: bool = False, data_format_type: FormatType, old_dataset_id: Optional[int] = None, ak: Optional[str] = None, sk: Optional[str] = None)[source]

Bases: DataSource, BaseModel

Qianfan data source

async afetch(**kwargs: Any) Union[str, List[str]][source]

Asynchronously read data from qianfan or local cache. Not available currently.

Args:

**kwargs (Any): Arbitrary keyword arguments.

Returns:

Union[str, List[str]]: content retrieved from data source

ak: Optional[str]
async asave(data: str, is_annotated: bool = False, **kwargs: Any) bool[source]

Asynchronously write data to qianfan. Currently only writing to user BOS storage is supported.

Not available currently

Args:

data (str): data waiting to be uploaded.

is_annotated (bool): has data been annotated

**kwargs (Any): optional arguments.

Returns:

bool: has data been uploaded successfully

classmethod create_bare_dataset(name: str, template_type: DataTemplateType, storage_type: DataStorageType = DataStorageType.PublicBos, storage_id: Optional[str] = None, storage_path: Optional[str] = None, addition_info: Optional[Dict[str, Any]] = None, ak: Optional[str] = None, sk: Optional[str] = None, **kwargs: Any) QianfanDataSource[source]

Create a bare (empty) dataset on qianfan as data source.

Args:

name (str): dataset name you want

template_type (DataTemplateType): template type applying to data set

storage_type (DataStorageType):

data storage type used to store your data, default to PublicBos

storage_id (Optional[str]): private BOS bucket name,

needed when storage_type is PrivateBos, default to None

storage_path (Optional[str]): private BOS file path,

needed when storage_type is PrivateBos, default to None

addition_info (Optional[Dict[str, Any]]):

additional info you want to have, default to None

ak (Optional[str]):

console ak related to your dataset and bos, default to None

sk (Optional[str]):

console sk related to your dataset and bos, default to None

kwargs (Any): other arguments

Returns:

QianfanDataSource: A datasource represents your dataset on Qianfan

classmethod create_from_bos_file(name: str, template_type: DataTemplateType, storage_id: str, storage_path: str, file_name: str, is_data_annotated: bool, storage_type: DataStorageType = DataStorageType.PrivateBos, addition_info: Optional[Dict[str, Any]] = None, ak: Optional[str] = None, sk: Optional[str] = None, is_download_to_local: bool = True, **kwargs: Any) QianfanDataSource[source]

Create a dataset on qianfan as data source, which will import data from the specified bos file.

Args:

name (str): dataset name you want

template_type (DataTemplateType): template type applying to data set

storage_id (str): private BOS bucket name

storage_path (str): private BOS file path

file_name (str): file to upload

is_data_annotated (bool): is data in bos annotated

storage_type (DataStorageType):

data storage type used to store your data, default to PrivateBos

addition_info (Optional[Dict[str, Any]]):

additional info you want to have, default to None

ak (Optional[str]):

console ak related to your dataset and bos, default to None

sk (Optional[str]):

console sk related to your dataset and bos, default to None

is_download_to_local (bool):

whether the dataset downloads the file when the object is initialized, default to True

kwargs (Any): other arguments

Returns:

QianfanDataSource: A datasource represents your dataset on Qianfan

data_format_type: FormatType
download_when_init: bool
fetch(**kwargs: Any) Union[str, List[str]][source]

Read data from qianfan or local cache.

Args:

**kwargs (Any): Arbitrary keyword arguments.

Returns:

Union[str, List[str]]: content retrieved from data source

format_type() FormatType[source]

Get format type binding to qianfan data source

Returns:

FormatType: format type binding to qianfan data source

classmethod get_existed_dataset(dataset_id: str, is_download_to_local: bool = True, ak: Optional[str] = None, sk: Optional[str] = None, **kwargs: Any) QianfanDataSource[source]

Load a dataset from qianfan as data source

Args:

dataset_id (str): dataset id on Qianfan, shown as “数据集版本 ID” (dataset version ID)

is_download_to_local (bool):

whether the dataset downloads the file when the object is initialized, default to True

ak (Optional[str]):

console ak related to your dataset and bos, default to None

sk (Optional[str]):

console sk related to your dataset and bos, default to None

kwargs (Any): other arguments

Returns:

QianfanDataSource: A datasource represents your dataset on Qianfan

group_id: str
id: str
info: Dict[str, Any]
name: str
old_dataset_id: Optional[int]
project_type: DataProjectType
release_dataset(**kwargs: Any) bool[source]

make a dataset released

Returns:

bool: Whether releasing succeeded

save(data: Optional[str] = None, zip_file_path: Optional[str] = None, is_annotated: bool = False, does_release: bool = False, sup_storage_id: str = '', sup_storage_path: str = '', sup_storage_region: str = '', **kwargs: Any) bool[source]

Write data to qianfan. Currently only writing to user BOS storage is supported.

Args:

data (Optional[str]): data waiting to be uploaded, default to None

zip_file_path (Optional[str]):

zip file path which contains data files, default to None.

is_annotated (bool): has data been annotated, default to False

does_release (bool):

whether to release the dataset after saving successfully, default to False

sup_storage_id (str):

bos bucket name used for uploading, we recommend using this parameter when your destination dataset on qianfan is stored in public BOS. Default to empty str

sup_storage_path (str):

bos bucket file path used for uploading, we recommend using this parameter when your destination dataset on qianfan is stored in public BOS. Default to empty str

sup_storage_region (str):

bos bucket region used for uploading, we recommend using this parameter when your destination dataset on qianfan is stored in public BOS. Default to empty str

**kwargs (Any): optional arguments.

Returns:

bool: has data been uploaded successfully

set_format_type(format_type: FormatType) None[source]

Set format type binding to qianfan data source. Not available.

TextOnly -> Jsonl

MultiModel -> Json

set_type: DataSetType
sk: Optional[str]
storage_id: str
storage_name: str
storage_path: str
storage_raw_path: Optional[str]
storage_region: Optional[str]
storage_type: DataStorageType
template_type: DataTemplateType
version: int

Submodules

qianfan.dataset.data_source.baidu_qianfan module

qianfan data source implementation including uploading / downloading

class qianfan.dataset.data_source.baidu_qianfan.QianfanDataSource(*, id: str, group_id: str, name: str, set_type: DataSetType, project_type: DataProjectType, template_type: DataTemplateType, version: int, storage_type: DataStorageType, storage_id: str, storage_path: str, storage_raw_path: Optional[str] = None, storage_name: str, storage_region: Optional[str] = None, info: Dict[str, Any] = {}, download_when_init: bool = False, data_format_type: FormatType, old_dataset_id: Optional[int] = None, ak: Optional[str] = None, sk: Optional[str] = None)[source]

Bases: DataSource, BaseModel

Qianfan data source

async afetch(**kwargs: Any) Union[str, List[str]][source]

Asynchronously read data from qianfan or local cache. Not available currently.

Args:

**kwargs (Any): Arbitrary keyword arguments.

Returns:

Union[str, List[str]]: content retrieved from data source

ak: Optional[str]
async asave(data: str, is_annotated: bool = False, **kwargs: Any) bool[source]

Asynchronously write data to qianfan. Currently only writing to user BOS storage is supported.

Not available currently

Args:

data (str): data waiting to be uploaded.

is_annotated (bool): has data been annotated

**kwargs (Any): optional arguments.

Returns:

bool: has data been uploaded successfully

classmethod create_bare_dataset(name: str, template_type: DataTemplateType, storage_type: DataStorageType = DataStorageType.PublicBos, storage_id: Optional[str] = None, storage_path: Optional[str] = None, addition_info: Optional[Dict[str, Any]] = None, ak: Optional[str] = None, sk: Optional[str] = None, **kwargs: Any) QianfanDataSource[source]

Create a bare (empty) dataset on qianfan as data source.

Args:

name (str): dataset name you want

template_type (DataTemplateType): template type applying to data set

storage_type (DataStorageType):

data storage type used to store your data, default to PublicBos

storage_id (Optional[str]): private BOS bucket name,

needed when storage_type is PrivateBos, default to None

storage_path (Optional[str]): private BOS file path,

needed when storage_type is PrivateBos, default to None

addition_info (Optional[Dict[str, Any]]):

additional info you want to have, default to None

ak (Optional[str]):

console ak related to your dataset and bos, default to None

sk (Optional[str]):

console sk related to your dataset and bos, default to None

kwargs (Any): other arguments

Returns:

QianfanDataSource: A datasource represents your dataset on Qianfan

classmethod create_from_bos_file(name: str, template_type: DataTemplateType, storage_id: str, storage_path: str, file_name: str, is_data_annotated: bool, storage_type: DataStorageType = DataStorageType.PrivateBos, addition_info: Optional[Dict[str, Any]] = None, ak: Optional[str] = None, sk: Optional[str] = None, is_download_to_local: bool = True, **kwargs: Any) QianfanDataSource[source]

Create a dataset on qianfan as data source, which will import data from the specified bos file.

Args:

name (str): dataset name you want

template_type (DataTemplateType): template type applying to data set

storage_id (str): private BOS bucket name

storage_path (str): private BOS file path

file_name (str): file to upload

is_data_annotated (bool): is data in bos annotated

storage_type (DataStorageType):

data storage type used to store your data, default to PrivateBos

addition_info (Optional[Dict[str, Any]]):

additional info you want to have, default to None

ak (Optional[str]):

console ak related to your dataset and bos, default to None

sk (Optional[str]):

console sk related to your dataset and bos, default to None

is_download_to_local (bool):

whether the dataset downloads the file when the object is initialized, default to True

kwargs (Any): other arguments

Returns:

QianfanDataSource: A datasource represents your dataset on Qianfan

data_format_type: FormatType
download_when_init: bool
fetch(**kwargs: Any) Union[str, List[str]][source]

Read data from qianfan or local cache.

Args:

**kwargs (Any): Arbitrary keyword arguments.

Returns:

Union[str, List[str]]: content retrieved from data source

format_type() FormatType[source]

Get format type binding to qianfan data source

Returns:

FormatType: format type binding to qianfan data source

classmethod get_existed_dataset(dataset_id: str, is_download_to_local: bool = True, ak: Optional[str] = None, sk: Optional[str] = None, **kwargs: Any) QianfanDataSource[source]

Load a dataset from qianfan as data source

Args:

dataset_id (str): dataset id on Qianfan, shown as “数据集版本 ID” (dataset version ID)

is_download_to_local (bool):

whether the dataset downloads the file when the object is initialized, default to True

ak (Optional[str]):

console ak related to your dataset and bos, default to None

sk (Optional[str]):

console sk related to your dataset and bos, default to None

kwargs (Any): other arguments

Returns:

QianfanDataSource: A datasource represents your dataset on Qianfan

group_id: str
id: str
info: Dict[str, Any]
name: str
old_dataset_id: Optional[int]
project_type: DataProjectType
release_dataset(**kwargs: Any) bool[source]

make a dataset released

Returns:

bool: Whether releasing succeeded

save(data: Optional[str] = None, zip_file_path: Optional[str] = None, is_annotated: bool = False, does_release: bool = False, sup_storage_id: str = '', sup_storage_path: str = '', sup_storage_region: str = '', **kwargs: Any) bool[source]

Write data to qianfan. Currently only writing to user BOS storage is supported.

Args:

data (Optional[str]): data waiting to be uploaded, default to None

zip_file_path (Optional[str]):

zip file path which contains data files, default to None.

is_annotated (bool): has data been annotated, default to False

does_release (bool):

whether to release the dataset after saving successfully, default to False

sup_storage_id (str):

bos bucket name used for uploading, we recommend using this parameter when your destination dataset on qianfan is stored in public BOS. Default to empty str

sup_storage_path (str):

bos bucket file path used for uploading, we recommend using this parameter when your destination dataset on qianfan is stored in public BOS. Default to empty str

sup_storage_region (str):

bos bucket region used for uploading, we recommend using this parameter when your destination dataset on qianfan is stored in public BOS. Default to empty str

**kwargs (Any): optional arguments.

Returns:

bool: has data been uploaded successfully

set_format_type(format_type: FormatType) None[source]

Set format type binding to qianfan data source. Not available.

TextOnly -> Jsonl

MultiModel -> Json

set_type: DataSetType
sk: Optional[str]
storage_id: str
storage_name: str
storage_path: str
storage_raw_path: Optional[str]
storage_region: Optional[str]
storage_type: DataStorageType
template_type: DataTemplateType
version: int

qianfan.dataset.data_source.base module

base data source definition

class qianfan.dataset.data_source.base.DataSource[source]

Bases: ABC

basic data source class

abstract async afetch(**kwargs: Any) Union[str, List[str]][source]

Asynchronously fetch data from source

Args:

**kwargs (Any): optional arguments

Returns:

Union[str, List[str]]: content retrieved from data source

abstract async asave(data: str, **kwargs: Any) bool[source]

Asynchronously export the data to the data source and return whether the export was successful or failed.

Args:

data (str): data to be saved

**kwargs (Any): optional arguments

Returns:

bool: is saving successful

abstract fetch(**kwargs: Any) Union[str, List[str]][source]

Fetch data from source

Args:

**kwargs (Any): optional arguments

Returns:

Union[str, List[str]]: content retrieved from data source

abstract format_type() FormatType[source]

Get format type binding to source

Returns:

FormatType: format type binding to source

abstract save(data: str, **kwargs: Any) bool[source]

Export the data to the data source and return whether the export was successful or failed.

Args:

data (str): data to be saved

**kwargs (Any): optional arguments

Returns:

bool: is saving successful

abstract set_format_type(format_type: FormatType) None[source]

Set format type binding to source

Args:

format_type (FormatType): format type binding to source

class qianfan.dataset.data_source.base.FormatType(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)[source]

Bases: Enum

Enum for data source format type

Csv = 'csv'
Json = 'json'
Jsonl = 'jsonl'
Text = 'txt'

qianfan.dataset.data_source.bos module

bos data source implementation including uploading / downloading

class qianfan.dataset.data_source.bos.BosDataSource(*, region: str, bucket: str, bos_file_path: str, file_format: Optional[FormatType] = None, ak: Optional[str] = None, sk: Optional[str] = None)[source]

Bases: DataSource, BaseModel

Bos Data Source

async afetch(**kwargs: Any) Union[str, List[str]][source]

Asynchronously Read data from bos. Not available currently

Args:

**kwargs (Any): Arbitrary keyword arguments.

Returns:
Union[str, List[str]]:

String or list of string containing the data read from the file.

ak: Optional[str]
async asave(data: str, **kwargs: Any) bool[source]

Asynchronously export the data to specific bos storage and return whether the export was successful or failed. Not available currently.

Args:

data (str): data to be saved

**kwargs (Any): optional arguments

Returns:

bool: is saving successful

bos_file_path: str
bucket: str
fetch(read_from_zip: bool = False, **kwargs: Any) Union[str, List[str]][source]

Read data from bos.

Args:
read_from_zip (bool):

whether to read data from a zip file, default to False

**kwargs (Any): Arbitrary keyword arguments.

Returns:
Union[str, List[str]]:

String or list of string containing the data read from the file.

file_format: Optional[FormatType]
format_type() FormatType[source]

Get format type binding to source

Returns:

FormatType: format type binding to source

region: str
save(data: Optional[str] = None, zip_file_path: Optional[str] = None, should_overwrite_existed_file: bool = False, **kwargs: Any) bool[source]

Export the data to specific bos storage and return whether the export was successful or failed.

Args:
data (Optional[str]):

data need to be saved, default to None

zip_file_path (Optional[str]):

path of your zip file, default to None

should_overwrite_existed_file (bool):

whether the bos data source should overwrite an existing file when saving data, default to False

**kwargs (Any):

optional arguments

Returns:

bool: is saving successful

set_format_type(format_type: FormatType) None[source]

Set format type binding to source

Args:

format_type (FormatType): format type binding to source

sk: Optional[str]

qianfan.dataset.data_source.file module

file data source implementation

class qianfan.dataset.data_source.file.FileDataSource(*, path: str, file_format: Optional[FormatType] = None, save_as_folder: bool = False)[source]

Bases: DataSource, BaseModel

file data source

async afetch(**kwargs: Any) Union[str, List[str]][source]

Asynchronously Read data from file. Not available currently

Args:

**kwargs (Any): Arbitrary keyword arguments.

Returns:
Union[str, List[str]]:

String or list of string containing the data read from the file.

async asave(data: Union[str, List[str]], **kwargs: Any) bool[source]

Asynchronously write data to file. Not available currently.

Args:

data (Union[str, List[str]]): data waiting to be written.

**kwargs (Any): optional arguments.

Returns:

bool: has data been written successfully

fetch(**kwargs: Any) Union[str, List[str]][source]

Read data from file.

Args:

**kwargs (Any): Arbitrary keyword arguments.

Returns:
Union[str, List[str]]:

String or list of string containing the data read from the file.

file_format: Optional[FormatType]
format_type() FormatType[source]

Get format type binding to source

Returns:

FormatType: format type binding to source

path: str
save(data: Union[str, List[str]], **kwargs: Any) bool[source]

Write data to file.

Args:

data (Union[str, List[str]]): data waiting to be written.

**kwargs (Any): optional arguments.

Returns:

bool: has data been written successfully

save_as_folder: bool
set_format_type(format_type: FormatType) None[source]

Set format type binding to source

Args:

format_type (FormatType): format type binding to source

qianfan.dataset.data_source.utils module

utilities for data source