GitHub: https://github.com/OpenBMB/MiniCPM-V
Hugging Face: https://huggingface.co/openbmb/MiniCPM-V-2_6
git clone https://huggingface.co/openbmb/MiniCPM-V-2_6
git clone https://github.com/vllm-project/vllm.git
cd vllm
pip install -e .
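To confirm the editable install succeeded before moving on, a quick import check is enough (a minimal sketch; any recent vLLM release exposes `__version__`):

# Sanity check: the install is usable if vllm imports cleanly.
import vllm
print(vllm.__version__)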
from PIL import Image
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
# List of image file paths
IMAGES = [
    "/root/ld/ld_project/MiniCPM-V/assets/airplane.jpeg",  # local image path
]
# Model name or path
MODEL_NAME = "/root/ld/ld_model_pretrained/Minicpmv2_6"  # local model path or Hugging Face model name
# Open the image and convert it to RGB
image = Image.open(IMAGES[0]).convert("RGB")
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Initialize the language model
llm = LLM(model=MODEL_NAME,
          gpu_memory_utilization=1,  # use all of the GPU memory
          trust_remote_code=True,
          max_model_len=2048)  # adjust to fit your available memory
# Build the chat messages
messages = [{'role': 'user', 'content': '(<image>./</image>)\n' + 'Please describe this image'}]
# Apply the chat template to the messages
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# Set the stop token IDs
# 2.0
# stop_token_ids = [tokenizer.eos_id]
# 2.5
# stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
# 2.6
stop_tokens = ['<|im_end|>', '<|endoftext|>']
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
# Set the generation parameters
sampling_params = SamplingParams(
    stop_token_ids=stop_token_ids,
    # temperature=0.7,
    # top_p=0.8,
    # top_k=100,
    # seed=3472,
    max_tokens=1024,
    # min_tokens=150,
    temperature=0,
    use_beam_search=True,  # beam search requires temperature=0 and best_of > 1
    # length_penalty=1.2,
    best_of=3)
# Run generation
outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {
        "image": image
    }
}, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
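Since `generate` also accepts a list of request dicts, several images can be described in one batched call, which is where vLLM's scheduler shines. A minimal sketch reusing the `llm`, `prompt`, and `sampling_params` defined above; `image_a` and `image_b` are hypothetical `PIL.Image` objects you have already loaded:

# Batched inference: one request dict per image, scheduled together by vLLM.
batch = [
    {"prompt": prompt, "multi_modal_data": {"image": image_a}},  # hypothetical image
    {"prompt": prompt, "multi_modal_data": {"image": image_b}},  # hypothetical image
]
outputs = llm.generate(batch, sampling_params=sampling_params)
for out in outputs:
    print(out.outputs[0].text)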
from transformers import AutoTokenizer
from decord import VideoReader, cpu
from PIL import Image
from vllm import LLM, SamplingParams
# Run video inference
MAX_NUM_FRAMES = 64

def encode_video(filepath):
    # Uniformly subsample a list down to n elements
    def uniform_sample(l, n):
        gap = len(l) / n
        idxs = [int(i * gap + gap / 2) for i in range(n)]
        return [l[i] for i in idxs]

    vr = VideoReader(filepath, ctx=cpu(0))
    sample_fps = round(vr.get_avg_fps() / 1)  # sample roughly one frame per second
    frame_idx = [i for i in range(0, len(vr), sample_fps)]
    if len(frame_idx) > MAX_NUM_FRAMES:
        frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
    video = vr.get_batch(frame_idx).asnumpy()
    video = [Image.fromarray(v.astype('uint8')) for v in video]
    return video

MODEL_NAME = "openbmb/MiniCPM-V-2_6"  # or local model path
video = encode_video("xxx.mp4")
messages = [{
    "role": "user",
    "content": "".join(["(<image>./</image>)"] * len(video)) + \
               "\nPlease describe this video."
}]
# The tokenizer must be initialized before it is used to build the prompt
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
llm = LLM(
    model=MODEL_NAME,
    gpu_memory_utilization=1,
    trust_remote_code=True,
    max_model_len=4096
)
stop_tokens = ['<|im_end|>', '<|endoftext|>']
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
sampling_params = SamplingParams(
    stop_token_ids=stop_token_ids,
    use_beam_search=True,
    temperature=0,
    best_of=3,  # beam search requires best_of > 1
    max_tokens=64
)
outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {
        "image": {
            "images": video,
            "use_image_id": False,
            "max_slice_nums": 1 if len(video) > 16 else 2  # fewer slices for long videos to save context
        }
    }
}, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
git clone https://github.com/vllm-project/vllm.git
cd vllm
pip install -e .
vllm serve /root/ld/ld_model_pretrained/Minicpmv2_6 --dtype auto --max-model-len 2048 --api-key token-abc123 --gpu-memory-utilization 1 --trust-remote-code
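Before sending any images, it is worth confirming the server is up and serving the expected model. A minimal sketch using the standard OpenAI SDK; the key and port match the `vllm serve` flags above:

from openai import OpenAI

# List the models exposed by the vLLM server; the id should match the
# path passed to `vllm serve`.
client = OpenAI(api_key="token-abc123", base_url="http://localhost:8000/v1")
for model in client.models.list():
    print(model.id)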
from openai import OpenAI
openai_api_key = "token-abc123" # your api key set in launch server
openai_api_base = "http://localhost:8000/v1" # http id
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
chat_response = client.chat.completions.create(
model="/root/ld/ld_model_pretrained/Minicpmv2_6", # model_local_path or huggingface id
messages=[{
"role": "user",
"content": [
# NOTE: 使用图像令牌 <image> 的提示格式是不必要的,因为提示将由API服务器自动处理。
# 由于提示将由API服务器自动处理,因此不需要使用包含 <image> 图像令牌的提示格式。
{"type": "text", "text": "请描述这张图片"},
{
"type": "image_url",
"image_url": {
"url": "https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg",
},
},
],
}],
extra_body={
"stop_token_ids": [151645, 151643]
}
)
print("Chat response:", chat_response)
print("Chat response content:", chat_response.choices[0].message.content)
3.2 Passing a local image
import base64

from openai import OpenAI

openai_api_key = "token-abc123"  # the API key set when launching the server
openai_api_base = "http://localhost:8000/v1"  # the server's base URL
client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)
# Encode a local image as a base64 data URL
with open('your/local/pic/path', 'rb') as file:
    image = "data:image/jpeg;base64," + base64.b64encode(file.read()).decode('utf-8')
chat_response = client.chat.completions.create(
    model="/root/ld/ld_model_pretrained/Minicpmv2_6",  # local model path or Hugging Face id
    messages=[{
        "role": "user",
        "content": [
            # NOTE: no <image> placeholder token is needed in the prompt;
            # the API server inserts it automatically.
            {"type": "text", "text": "Please describe this image"},
            {
                "type": "image_url",
                "image_url": {
                    "url": image,
                },
            },
        ],
    }],
    extra_body={
        "stop_token_ids": [151645, 151643]  # <|im_end|>, <|endoftext|>
    }
)
print("Chat response:", chat_response)
print("Chat response content:", chat_response.choices[0].message.content)
Hardware requirements: the non-quantized version needs more than 19 GB of memory to run; the quantized version needs more than 8 GB.
brew install ffmpeg
brew install pkg-config
git clone -b minicpm-v2.5 https://github.com/OpenBMB/llama.cpp.git
cd llama.cpp
make
a. First, download the PyTorch weights from Hugging Face or ModelScope:
git clone https://huggingface.co/openbmb/MiniCPM-V-2_6
# Step 1: extract the model's intermediate outputs in preparation for the GGUF conversion
python ./examples/llava/minicpmv-convert/minicpmv2_6-surgery.py -m ../MiniCPM-V-2_6
# Convert the SigLIP vision encoder to GGUF
python ./examples/llava/minicpmv-convert/minicpmv2_6-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5
# Convert the language model to GGUF
python ./convert-hf-to-gguf.py ../MiniCPM-V-2_6/model
# Quantize to an int4 version
./llama-quantize ../MiniCPM-V-2_6/model/ggml-model-f16.gguf ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
Method 2: download the pre-converted GGUF weights directly (e.g. from the openbmb/MiniCPM-V-2_6-gguf repository on Hugging Face).
5. Run inference:
5.1 Image inference command
./llama-minicpmv-cli -m ./Minicpmv2_6gguf/ggml-model-Q4_K_M.gguf --mmproj ./Minicpmv2_6gguf/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image ./Minicpmv2_6gguf/42.jpg -p "What is in this picture?"
5.2 Video inference command
./llama-minicpmv-cli -m /Users/liudan/Downloads/Minicpmv2_6gguf/ggml-model-Q4_K_M.gguf --mmproj /Users/liudan/Downloads/Minicpmv2_6gguf/mmproj-model-f16.gguf -c 8192 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --video ./Minicpmv2_6gguf/test_vedieo.mp4 -p "I am about to give you a video; please tell me what it depicts"
git clone -b minicpm-v2.6 https://github.com/OpenBMB/ollama.git
cd ollama/llm
brew install go cmake gcc
go generate ./...
go build .
./ollama serve
vim minicpmv2_6.Modelfile
FROM ./MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf
FROM ./MiniCPM-V-2_6/mmproj-model-f16.gguf
TEMPLATE """{{ if .System }}<|im_start|>system
{{ .System }}<|im_end|>{{ end }}
{{ if .Prompt }}<|im_start|>user
{{ .Prompt }}<|im_end|>{{ end }}
<|im_start|>assistant
{{ .Response }}<|im_end|>"""
PARAMETER stop "<|endoftext|>"
PARAMETER stop "<|im_end|>"
PARAMETER num_ctx 2048
ollama create minicpm2.6 -f minicpmv2_6.Modelfile
ollama run minicpm2.6
What is described in this picture? /Users/liudan/Desktop/11.jpg
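Beyond the interactive CLI, the model created above can also be called from Python through Ollama's HTTP API (served on port 11434 by `./ollama serve`). A minimal sketch; the image path is hypothetical and the `requests` package must be installed:

import base64
import requests

# Read a local image and send it to Ollama's /api/generate endpoint.
with open("/Users/liudan/Desktop/11.jpg", "rb") as f:  # hypothetical image path
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

resp = requests.post(
    "http://localhost:11434/api/generate",
    json={
        "model": "minicpm2.6",
        "prompt": "What is described in this picture?",
        "images": [image_b64],  # base64-encoded images, no data-URL prefix
        "stream": False,
    },
)
print(resp.json()["response"])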