tencent cloud

腾讯云 TI 平台

产品简介
产品概述
产品定价
客户价值
应用场景
购买指南
计费概述
购买方式
续费说明
欠费说明
安全合规
数据安全保护机制
监控、审计与日志
安全合规资质
快速入门
平台使用准备
操作指南
大模型广场
任务式建模
开发机
模型管理
模型评测
在线服务
资源组管理
数据源管理
Tikit
GPU 虚拟化
实践教程
LLM 部署及推理
LLM 训练及评测
内置训练镜像列表
自定义训练镜像规范
Angel 训练加速功能介绍
基于标签实现子用户间资源隔离
相关协议
TI Platform 隐私协议
TI Platform 数据处理和安全协议
开源软件信息
联系我们

提交训练任务

PDF
聚焦模式
字号
最后更新时间: 2025-06-20 18:36:41
describe_postpaid_training_price(self)
查询每种配置的每小时的价格
:rtype: tencentcloud.tione.v20211111.models.DescribeBillingSpecsResponse

describe_system_reasoning_images(self)
获取平台内置的推理镜像

:return: 推理镜像信息
:rtype: :class:`tikit.tencentcloud.tione.v20211111.models.DescribeInferTemplatesResponse`
数据格式如:
{
"FrameworkTemplates": [
{
"Framework": "TENSORFLOW",
"FrameworkVersion": "2.4",
"Groups": [
"TENSORFLOW",
"LIGHT"
],
"InferTemplates": [
{
"InferTemplateId": "tf2.4-py38-cpu",
"InferTemplateImage": "ccr.ccs.tencentyun.com/qcloud-ti-platform/ti-cloud-infer-tensorflow-cpu:py38-tensorflow2.4-cpu-20211206"
},
{
"InferTemplateId": "tf2.4-py38-gpu",
"InferTemplateImage": "ccr.ccs.tencentyun.com/qcloud-ti-platform/ti-cloud-infer-tensorflow-gpu:py38-tensorflow2.4-cu110-20211206"
}
]
}
],
"RequestId": "3654e19b-c2ba-4953-b131-d66495723008"
}

def create_training_task(self,
name,
framework,
cos_output,
worker_resource,
code_package_path,
ps_resource=None,
input_data_config=None,
worker_start_cmd=None,
ps_start_cmd=None,
tags=None,
tuning_parameters_dict={},
resource_group_id="",
remark=None,
log_enable=False,
log_logset_id=None,
log_topic_id=None,
vpc_id=None,
sub_net_id=None,
restart_limit=0):
"""创建训练任务

:param name: 任务名称
:type name: str
:param framework: 运行的框架环境
:type framework: :class:`tikit.models.FrameworkInfo`
:param cos_output: 输出的cos信息
:type cos_output: str
:param worker_resource: worker节点的配置
:type worker_resource: :class:`tikit.models.ResourceConfigInfo`
:param code_package_path: 代码的cos信息
:type code_package_path: str
:param ps_resource: ps节点的配置
:type ps_resource: :class:`tikit.models.ResourceConfigInfo`
:param input_data_config: 输入的数据信息
:type input_data_config: list or :class:`tikit.models.TrainingDataConfig`
:param worker_start_cmd: worker的启动命令
:type worker_start_cmd: str
:param ps_start_cmd: ps节点的启动命令
:type ps_start_cmd: str
:param tags: 标签
:type tags: list of :class:`tikit.tencentcloud.tione.v20211111.models.Tag`
:param tuning_parameters_dict: 调参字典
:type tuning_parameters_dict: dict
:param resource_group_id: 预付费的资源组id
:type resource_group_id: str
:param remark: 描述
:type remark: str
:param log_enable: 日志开关
:type log_enable: bool
:param log_logset_id: 日志集id
:type log_logset_id: str
:param log_topic_id: 日志的topic id
:type log_topic_id: str
:param vpc_id: vpc的id
:type vpc_id: str
:param sub_net_id: 子网id
:type sub_net_id: str
:param restart_limit: 当前任务最大重启次数,最高10次,超过后任务被标记为异常
:type restart_limit: int
:return:
:rtype: :class:`tikit.tencentcloud.tione.v20211111.models.CreateTrainingTaskResponse`
"""

def describe_training_frameworks(self):
"""查看训练框架

:rtype: :class:`tikit.tencentcloud.tione.v20211111.models.DescribeTrainingFrameworksResponse`

"""

def describe_training_tasks(self, filters=None, tag_filters=None, offset=0, limit=50, order="DESC",
order_field="UpdateTime"):
"""获取训练任务列表

:param filters: 过滤器,eg:[{ "Name": "TaskStatus", "Values": ["Running"] }]
:type filters: list of Filter
:param tag_filters: 标签过滤器,eg:[{ "TagKey": "TagKeyA", "TagValue": ["TagValueA"] }]
:type tag_filters: list of TagFilter
:param offset: 偏移量,默认为0
:type offset: int
:param limit: 返回数量,默认为50
:type limit: int
:param order: 输出列表的排列顺序。取值范围:ASC:升序排列 DESC:降序排列
:type order: str
:param order_field: 排序的依据字段, 取值范围 "CreateTime" "UpdateTime"
:type order_field: str
:return:
:rtype: :class:`tikit.tencentcloud.tione.v20211111.models.DescribeTrainingTasksResponse`
"""

def describe_training_task(self, task_id):
"""获取单个训练任务信息

:param task_id: 训练任务ID
:type task_id: str
:rtype: :class:`tencentcloud.tione.v20211111.models.DescribeTrainingTaskResponse`
"""

def describe_training_task_pods(self, task_id):
"""获取训练任务的pod列表

:param task_id: 训练任务ID
:type task_id: str
:rtype: :class:`tencentcloud.tione.v20211111.models.DescribeTrainingTaskPodsResponse`
"""

def describe_train_logs(self, pod_name, start_time=None, end_time=None, limit=None, order=None,
context=None, filters=None):
"""查看训练任务的日志

:param pod_name: 查询哪个Pod的日志,支持通配符。查看某个训练任务的全部pod的日志可以填: "<task_id>-*",如:train-51cd6bf7ec1000-*
:type pod_name: str
:param start_time: 日志查询开始时间。RFC3339格式的时间字符串,比如2021-12-16T13:20:24+08:00,默认值为当前时间的前一个小时
:type start_time: str
:param end_time: 日志查询结束时间。RFC3339格式的时间字符串,比如2021-12-16T13:20:24+08:00,默认值为当前时间
:type end_time: str
:param limit: 日志查询条数,默认值100,最大值100
:type limit: int
:param order: 排序方向。(ASC | DESC) 默认值为DESC
:type order: str
:param context: 分页的游标
:type context: str
:param filters: 过滤Filters
:type filters: list of tikit.tencentcloud.tione.v20211111.models.Filter
:rtype: :class:`tikit.tencentcloud.tione.v20211111.models.DescribeLogsResponse`

返回的对象如果非空,就会有 next() 方法,能不断地获取下一页的日志(如果有多页的话),如下:
now_time = datetime.datetime.now(datetime.timezone.utc)
now_time_str = now_time.isoformat()
result = client.describe_train_logs("train-51cd6bf7ec1000-37c5p5nlr01s-launcher",
"2021-12-10T09:32:03.823509+00:00",
now_time_str,
limit=30)
print(result)
print(result.next())
print(result.next())
print(result.next())
"""

def push_training_metrics(self, timestamp, value_map, task_id=None, epoch=None, total_steps=None, step=None):
"""上报训练自定义指标(单条)。单个子账号每秒可以调用20次,请在您的训练代码中注意控制上报频率,避免超限报错。或者使用push_training_metrics_list

:param timestamp: 时间戳
:type timestamp: int
:param value_map: 指标映射。 指标名称 -> 指标值
:type value_map: map: str -> float
:param task_id: 任务ID。若为空,则取当前任务节点环境中 TI_TASK_ID 环境变量的值
:type task_id: str
:param epoch: epoch值
:type epoch: int
:param total_steps: 总步数
:type total_steps: int
:param step: 第几步
:type step: int
:return:
:rtype: :class:`tikit.tencentcloud.tione.v20211111.models.PushTrainingMetricsResponse`

client.push_training_metrics(int(time.time()), {"field1": 11, "field2": 12}, "task-id-00001", 3, 1000, 66)
"""

def push_training_metrics_list(self, metric_list):
"""上报训练自定义指标(列表)

:param metric_list: MetricData 数组。 若任务ID为空,则取当前任务节点环境中 TI_TASK_ID 环境变量的值
:type metric_list: list of :class:`tencentcloud.tione.v20211111.models.MetricData`
:return:
:rtype: :class:`tencentcloud.tione.v20211111.models.PushTrainingMetricsResponse`
"""

def describe_training_metrics(self, task_id):
"""查询训练自定义指标

:param task_id: 任务ID
:type task_id: str
:rtype: :class:`tencentcloud.tione.v20211111.models.DescribeTrainingMetricsResponse`
"""

def stop_training_task(self, task_id):
"""停止某个训练任务

:param task_id: 训练任务ID
:type task_id: str
:rtype: :class:`tencentcloud.tione.v20211111.models.StopTrainingTaskResponse`
"""

def delete_training_task(self, task_id):
"""删除某个训练任务

:param task_id: 训练任务ID
:type task_id: str
:rtype: :class:`tencentcloud.tione.v20211111.models.DeleteTrainingTaskResponse`
"""


中间配置:
tikit.models.FrameworkInfo

def new_custom(training_mode, image_type, image_url, registry_region=None, registry_id=None,
user_name=None, passwd=None):
"""自定义训练框架的配置 通过describe_training_frameworks()查看列表

:param training_mode: 训练模式,如"DDP",仅训练任务需要配置
:type training_mode: str
:param image_type: 镜像类型,CCR为腾讯云容器镜像服务个人版,TCR为腾讯云容器镜像服务企业版,CUSTOM表示第三方自定义镜像
:type image_type: str
:param image_url: 镜像地址 必填
:type image_url: str
:param registry_region: 腾讯云容器镜像服务的镜像仓库的地域
:type registry_region: str
:param registry_id: 腾讯云容器镜像服务的镜像仓库ID
:type registry_id: str
:param user_name: 自定义镜像仓库的用户名
:type user_name: str
:param passwd: 自定义镜像仓库的密码
:type passwd: str
:return:
:rtype:
"""

def new_system_framework(framework_name, framework_environment, training_mode):
"""系统内置的训练框架

:param framework_name: 框架名称。 通过describe_training_frameworks()查看列表
:type framework_name: str
:param framework_environment: 框架环境。 通过describe_training_frameworks()查看列表
:type framework_environment: str
:param training_mode: 训练模式。 通过describe_training_frameworks()查看列表
:type training_mode: str
:return:
:rtype:
"""

----------------------------------------------------
tikit.models.ResourceConfigInfo

def new_postpaid(instance_type, instance_num):
"""获取后付费模式下的资源配置

:param instance_type: 实例类型。通过 describe_postpaid_training_price() 查看实例列表
:type instance_type: str
:param instance_num: 实例数量
:type instance_num: int
:return:
:rtype:
"""

def new_prepaid(cpu=0, memory=0, gpu=0, gpu_type=None, instance_num=1):
"""获取预付费模式下的资源配置,当资源组开启GPU按比例预设后,cpu和memory参数需要不设置或者设置为0,该两项的值将通过GPU卡数按照比例分配

:param cpu: CPU个数,单位是核
:type cpu: float
:param memory: 内存大小,单位是GB
:type memory: float
:param gpu_type: gpu类型
:type gpu_type: str
:param gpu: gpu个数
:type gpu: float
:param instance_num: 实例数量
:type instance_num: int
:return:
:rtype:
"""

----------------------------------------------------
tikit.models.TrainingDataConfig

def new_mount_cos(cos_str, target_path):
"""一个cos类型的训练数据

:param cos_str: cos存储,格式: <bucket>/<cos path>/
:type cos_str: str
:param target_path: 目标挂载路径
:type target_path: str
:return:
:rtype:
"""

def new_dataset_mount(dataset_id, target_path):
"""一个dataset类型的训练数据

:param dataset_id: 数据集ID
:type dataset_id: str
:param target_path: 目标挂载路径
:type target_path: str
:return:
:rtype:
"""

def new_mount_cfs(cfs_id, source_path, target_path):
"""新建一个cfs类型的训练数据集配置

:param cfs_id: CFS的ID
:type cfs_id: str
:param source_path: CFS的路径
:type source_path: str
:param target_path: 目标挂载路径
:type target_path: str
:return:
:rtype:
"""

def new_mount_hdfs(hdfs_id, source_path, target_path):
"""新建一个hdfs类型的训练数据集配置

:param hdfs_id: EMR上HDFS的ID
:type hdfs_id: str
:param source_path: HDFS的路径
:type source_path: str
:param target_path: 目标挂载路径
:type target_path: str
:return:
:rtype:
"""

def new_mount_wedata_hdfs(wedata_id, source_path):
"""新建一个wedata hdfs类型的训练数据集配置

:param wedata_id: wedata数据源id
:type wedata_id: int
:param source_path: HDFS的路径
:type source_path: str
:return:
:rtype:
"""

def new_dataset(id_target_dict):
""" Deprecated !
新建一个dataset类型的训练数据集配置

:param id_target_dict: 数据集信息。 dataset id -> 下载的目标路径
:type id_target_dict: dict
:return:
:rtype:
"""

def new_cos_data(cos_str_target_dict):
"""Deprecated !
新建一个cos类型的训练数据集配置

:param cos_str_target_dict: 数据集信息。 <bucket>/<cos path>/ -> 下载的目标路径
:type cos_str_target_dict: dict
:return:
:rtype:
"""

def new_mount_goosefs(goosefs_id, source_path, namespace, target_path):
"""新建一个goosefs类型的训练数据集配置

:param goosefs_id: goosefs实例id
:type goosefs_id: str
:param source_path: goosefs的路径
:type source_path: str
:param namespace: goosefs的命名空间
:type namespace: str
:param target_path: 目标挂载路径
:type target_path: str
:return:
:rtype:
"""
def new_mount_goosefsx(goosefsx_id, goosefsx_path, target_path):
"""新建一个goosefsx类型的存储配置

:param goosefsx_id: goosefsx实例id
:type goosefsx_id: str
:param goosefsx_path: goosefsx路径
:type goosefsx_path: str
:param target_path: 目标挂载路径
:type target_path: str
:return:
:rtype:
"""



帮助和支持

本页内容是否解决了您的问题?

填写满意度调查问卷,共创更好文档体验。

文档反馈