#!/usr/bin/env python3
"""Test GPU availability and configuration."""

import sys

import torch

def test_gpu():
    print("=== GPU Test ===")
    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA available: {torch.cuda.is_available()}")

    if torch.cuda.is_available():
        print(f"CUDA version: {torch.version.cuda}")
        print(f"GPU count: {torch.cuda.device_count()}")

        for i in range(torch.cuda.device_count()):
            print(f"\nGPU {i}:")
            print(f"  Name: {torch.cuda.get_device_name(i)}")
            print(f"  Total memory: {torch.cuda.get_device_properties(i).total_memory / 1024**3:.2f} GB")

            # Run a small matmul on this device to verify compute works.
            try:
                x = torch.randn(3, 3).cuda(i)
                y = torch.randn(3, 3).cuda(i)
                torch.matmul(x, y)
                print("  GPU compute test: passed")
                print(f"  Current memory usage: {torch.cuda.memory_allocated(i) / 1024**2:.2f} MB")
                print(f"  Peak memory usage: {torch.cuda.max_memory_allocated(i) / 1024**2:.2f} MB")
            except Exception as e:
                print(f"  GPU compute test: failed - {e}")
    else:
        print("Warning: CUDA is not available; running on CPU")

    return torch.cuda.is_available()

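# The peak figure printed above (torch.cuda.max_memory_allocated) accumulates
# from process start or from the last reset. A minimal sketch of isolating the
# peak usage of a single callable, assuming a CUDA device is present; this
# helper is illustrative only and is not called by the tests below.
def measure_peak_mb(fn, device=0):
    """Run fn() and return the peak GPU memory it used, in MiB."""
    torch.cuda.reset_peak_memory_stats(device)
    fn()
    return torch.cuda.max_memory_allocated(device) / 1024**2
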
def test_device_resolution():
    print("\n=== Device Resolution Test ===")

    def resolve_device(mode: str) -> str:
        mode = mode.lower()
        if mode == "cpu":
            return "cpu"
        if mode == "cuda":
            if not torch.cuda.is_available():
                raise SystemExit("device set to cuda but CUDA is not available")
            return "cuda"
        # "auto" (and any other value): prefer CUDA when present.
        if torch.cuda.is_available():
            return "cuda"
        return "cpu"

    test_cases = [
        ("auto", "should pick the GPU automatically when available"),
        ("cpu", "should force the CPU"),
        ("cuda", "should use the GPU when available"),
    ]

    for mode, description in test_cases:
        try:
            device = resolve_device(mode)
            print(f"Mode '{mode}': {device} - {description}")
        except SystemExit as e:
            print(f"Mode '{mode}': error - {e}")
        except Exception as e:
            print(f"Mode '{mode}': exception - {e}")

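# A minimal sketch of wiring device resolution into a --device CLI flag,
# mirroring the run_pipeline.py / train.py commands suggested in main() below.
# The parser is an illustration only and is never invoked by these tests.
def build_device_arg_parser():
    import argparse

    parser = argparse.ArgumentParser(description="device selection example")
    parser.add_argument(
        "--device",
        choices=["auto", "cpu", "cuda"],
        default="auto",
        help="compute device; 'auto' prefers CUDA when it is available",
    )
    return parser
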
def test_training_components():
    print("\n=== Training Component Test ===")

    # Verify that the key project modules can be imported.
    try:
        # cosine_beta_schedule is imported only to confirm it exists.
        from hybrid_diffusion import HybridDiffusionModel, cosine_beta_schedule  # noqa: F401

        print("✓ hybrid_diffusion module imported")

        # Build a small model instance.
        model = HybridDiffusionModel(cont_dim=10, disc_vocab_sizes=[5, 3, 2])

        if torch.cuda.is_available():
            model = model.cuda()
            print("✓ model moved to GPU")
        else:
            print("✓ model running on CPU")

        # Forward-pass smoke test.
        batch_size = 2
        seq_len = 16
        x_cont = torch.randn(batch_size, seq_len, 10)
        x_disc = torch.randint(0, 5, (batch_size, seq_len, 3))
        t = torch.randint(0, 100, (batch_size,))

        if torch.cuda.is_available():
            x_cont = x_cont.cuda()
            x_disc = x_disc.cuda()
            t = t.cuda()

        eps_pred, logits = model(x_cont, x_disc, t)
        print("✓ forward pass succeeded")
        print(f"  continuous output shape: {eps_pred.shape}")
        print(f"  number of discrete outputs: {len(logits)}")

    except ImportError as e:
        print(f"✗ module import failed: {e}")
    except Exception as e:
        print(f"✗ test failed: {e}")

def main():
    print("Starting GPU and training-configuration tests...")

    gpu_available = test_gpu()
    test_device_resolution()
    test_training_components()

    print("\n=== Usage Suggestions ===")
    if gpu_available:
        print("1. Run on the GPU: python run_pipeline.py --device cuda")
        print("2. Or auto-select: python run_pipeline.py --device auto")
        print("3. Train only: python train.py --device cuda")
        print("4. Sample only: python export_samples.py --device cuda")
    else:
        print("1. CPU-only run: python run_pipeline.py --device cpu")
        print("2. Check the CUDA and PyTorch installation")
        print("3. Make sure an NVIDIA GPU and a matching driver are installed")

    return gpu_available


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)
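# Example shell usage, assuming this file is saved as test_gpu.py (the exit
# code mirrors GPU availability, so the second command runs only on success):
#   python test_gpu.py && python run_pipeline.py --device cuda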