# copy pasted from https://docs.vllm.ai/en/latest/getting_started/quickstart.html
# optionally: export VLLM_USE_MODELSCOPE=True  (to download models from ModelScope instead of the Hugging Face Hub)
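
# A minimal sketch of setting that flag from Python instead of the shell
# (assumption: ModelScope downloads are wanted; set it before importing vllm):
# import os
# os.environ["VLLM_USE_MODELSCOPE"] = "True"
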
import vllm
from vllm import LLM, SamplingParams

import torch


def test_pytorch():
    print('\n----- Test PyTorch ---')
    # Print the PyTorch version and CUDA version
    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA version: {torch.version.cuda}")

    # Perform a matrix multiplication on CUDA and print the result
    # (guarded so the script also runs on CPU-only machines)
    if torch.cuda.is_available():
        result = torch.randn(2, 4).cuda() @ torch.randn(4, 1).cuda()
        print(f"Matrix multiplication result: {result}")

    # Check CUDA availability and device details
    print(f'Number of CUDA devices: {torch.cuda.device_count()}')
    if torch.cuda.device_count() > 0:
        print(f'Device name: {torch.cuda.get_device_name(0)}')
    else:
        print("No CUDA devices available.")


def test_vllm():
    print('\n----- Test vLLM ---')
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
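    # Variant sketch (the value 64 is an assumption): SamplingParams also
    # accepts max_tokens to cap the completion length, e.g.
    # sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=64)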

    llm = LLM(model="facebook/opt-125m")
    # llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1")
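    # Hedged sketch of common LLM() knobs (values are assumptions, tune per GPU):
    # llm = LLM(model="facebook/opt-125m",
    #           gpu_memory_utilization=0.90,  # fraction of GPU memory vLLM may claim
    #           tensor_parallel_size=1)       # GPUs to shard the model across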

    # generate() returns a list of RequestOutput objects, one per prompt.
    outputs: list[vllm.outputs.RequestOutput] = llm.generate(prompts, sampling_params)
    print(f'{type(outputs)=}')
    print(f'{type(outputs[0])=}')

    # Print the outputs.
    output: vllm.outputs.RequestOutput
    for output in outputs:
        prompt: str = output.prompt
        generated_text: str = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


if __name__ == "__main__":
    import time

    start_time = time.time()
    test_pytorch()
    test_vllm()
    elapsed = time.time() - start_time  # compute once so all three figures agree
    print(f"Time taken: {elapsed:.2f} seconds, or {elapsed / 60:.2f} minutes, "
          f"or {elapsed / 3600:.2f} hours.\a")  # \a sounds the terminal bell