专业免费网站建设一般多少钱,做网站全包,搜索网站不显示图片,番禺网站开发服务简介
github demo
使用网络获取的油画图片，InternVL识别还算可以。
使用stable diffusion生成的图片，InternVL能很好的识别。
权重
huggingface地址 模型搭建
github地址
下载源码
git clone https://github.com/OpenGVLab/InternVL.git
创建环…
简介
github demo
使用网络获取的油画图片，InternVL识别还算可以。
使用stable diffusion生成的图片，InternVL能很好的识别。
权重
huggingface地址 模型搭建
github地址
下载源码
git clone https://github.com/OpenGVLab/InternVL.git
创建环境
conda create -n internvl python=3.9 -y
conda activate internvl
下载pytorch依赖，要求 PyTorch>=2.0、torchvision>=0.15.2、CUDA>=11.6
conda install pytorch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 pytorch-cuda=11.8 -c pytorch -c nvidia
# or
pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118
下载 flash-attn==0.2.8，这是因为不同版本的 flash attention 会产生细微的结果差异。
git clone https://github.com/Dao-AILab/flash-attention.git
cd flash-attention
git checkout v0.2.8
python setup.py install
下载 timm==0.6.11、mmcv-full==1.6.2
pip install -U openmim
pip install timm==0.6.11
mim install mmcv-full==1.6.2
下载 transformers==4.32.0
pip install transformers==4.32.0
下载 apex
git clone https://github.com/NVIDIA/apex.git
cd apex
git checkout 2386a912164b0c5cfcd8be7a2b890fbac5607c82 # https://github.com/NVIDIA/apex/issues/1735
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
下载依赖包
pip install opencv-python termcolor yacs pyyaml scipy
使用Huggingface快速开始
InternViT-6B
import torch
from PIL import Image
from transformers import AutoModel, CLIPImageProcessor

model = AutoModel.from_pretrained(
    'OpenGVLab/InternViT-6B-224px',
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True).cuda().eval()

image = Image.open('./examples/image1.jpg').convert('RGB')

image_processor = CLIPImageProcessor.from_pretrained('OpenGVLab/InternViT-6B-224px')

pixel_values = image_processor(images=image, return_tensors='pt').pixel_values
pixel_values = pixel_values.to(torch.bfloat16).cuda()

outputs = model(pixel_values)
InternVL-C(ontrastive) and InternVL-G(enerative)
import torch
from PIL import Image
from transformers import AutoModel, CLIPImageProcessor
from transformers import AutoTokenizer

model = AutoModel.from_pretrained(
    'OpenGVLab/InternVL-14B-224px',
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True).cuda().eval()

image_processor = CLIPImageProcessor.from_pretrained('OpenGVLab/InternVL-14B-224px')

tokenizer = AutoTokenizer.from_pretrained(
    'OpenGVLab/InternVL-14B-224px', use_fast=False, add_eos_token=True)
tokenizer.pad_token_id = 0  # set pad_token_id to 0

images = [
    Image.open('./examples/image1.jpg').convert('RGB'),
    Image.open('./examples/image2.jpg').convert('RGB'),
    Image.open('./examples/image3.jpg').convert('RGB')
]
prefix = 'summarize:'
texts = [
    prefix + 'a photo of a red panda',  # English
    prefix + '一张熊猫的照片',  # Chinese
    prefix + '二匹の猫の写真'  # Japanese
]

pixel_values = image_processor(images=images, return_tensors='pt').pixel_values
pixel_values = pixel_values.to(torch.bfloat16).cuda()
input_ids = tokenizer(texts, return_tensors='pt', max_length=80,
                      truncation=True, padding='max_length').input_ids.cuda()

# InternVL-C
logits_per_image, logits_per_text = model(
    image=pixel_values, text=input_ids, mode='InternVL-C')
probs = logits_per_image.softmax(dim=-1)
# tensor([[9.9609e-01, 5.2185e-03, 6.0070e-08],
#         [2.2949e-02, 9.7656e-01, 5.9903e-06],
#         [3.2932e-06, 7.4863e-05, 1.0000e+00]], device='cuda:0',
#        dtype=torch.bfloat16, grad_fn=<SoftmaxBackward0>)

# InternVL-G
logits_per_image, logits_per_text = model(
    image=pixel_values, text=input_ids, mode='InternVL-G')
probs = logits_per_image.softmax(dim=-1)
# tensor([[9.9609e-01, 3.1738e-03, 3.6322e-08],
#         [8.6060e-03, 9.9219e-01, 2.8759e-06],
#         [1.7583e-06, 3.1233e-05, 1.0000e+00]], device='cuda:0',
#        dtype=torch.bfloat16, grad_fn=<SoftmaxBackward0>)

# please set add_eos_token to False for generation
tokenizer.add_eos_token = False
image = Image.open('./examples/image1.jpg').convert('RGB')
pixel_values = image_processor(images=image, return_tensors='pt').pixel_values
pixel_values = pixel_values.to(torch.bfloat16).cuda()

tokenized = tokenizer("English caption:", return_tensors='pt')
pred = model.generate(
    pixel_values=pixel_values,
    input_ids=tokenized.input_ids.cuda(),
    attention_mask=tokenized.attention_mask.cuda(),
    num_beams=5,
    min_new_tokens=8,
)
caption = tokenizer.decode(pred[0].cpu(), skip_special_tokens=True).strip()
# English caption: a red panda sitting on top of a wooden platform