Windows10下使用SAM(Segment Anything Model)大模型实现万物皆可分割！-51CTO.COM

SAM(Segment Anything Model)，顾名思义，即为分割一切！该模型由Facebook的Meta AI实验室提出，能够根据文本指令或图像识别，实现对任意物体的识别与分割。它的诞生，无疑是CV领域的一次重要里程碑。

论文地址：https://arxiv.org/abs/2304.02643

在前文《从零解读SAM(Segment Anything Model)大模型！万物皆可分割！(含源码解析)》中从实现原理到源码解析对SAM大模型进行了详细解读，本文将演示Windows10下SAM大模型的实际使用过程！

SAM模型运行环境安装

1.环境要求

Python 3.8+
Pytorch 1.7+
Torchvision>=0.8

2.查看CUDA版本号

nvidia-smi

3.安装GPU版本的Pytorch

根据自己的cuda版本选择对应的版本，生成安装命令。

SAM模型代码使用

首先下载Github源码以及所提供的权重文件。

1.predictor_example

predictor_example.ipynb源码在notebooks文件目录下，可以本地运行测试。

步骤一：查看测试图片

import cv2
import matplotlib.pyplot as plt
# Load the test image. cv2.imread returns None instead of raising when the
# file is missing or unreadable, so fail early with a clear error.
image = cv2.imread('img.png')
if image is None:
    raise FileNotFoundError("cv2.imread could not read 'img.png'")
# OpenCV decodes to BGR; convert to RGB before handing it to matplotlib.
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Show the image with the axes visible so pixel coordinates can be read off.
plt.figure(figsize=(10, 10))
plt.imshow(image)
plt.axis('on')
plt.show()

步骤二：显示前景和背景的标记点

import numpy as np
import matplotlib.pyplot as plt
import cv2

def show_points(coords, labels, ax, marker_size=375):
    """Scatter the prompt points on ``ax`` as star markers.

    Rows of ``coords`` whose entry in ``labels`` equals 1 (foreground) are
    drawn in green; rows whose label equals 0 (background) are drawn in red.
    Column 0 of ``coords`` is used as x and column 1 as y.
    """
    # Marker styling shared by both groups of points.
    star_style = dict(marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
    # Foreground is drawn first, then background.
    for flag, colour in ((1, 'green'), (0, 'red')):
        chosen = coords[labels == flag]
        ax.scatter(chosen[:, 0], chosen[:, 1], color=colour, **star_style)

# Read the image file. cv2.imread returns None instead of raising when the
# file is missing or unreadable, so fail early with a clear error.
image = cv2.imread('img.png')
if image is None:
    raise FileNotFoundError("cv2.imread could not read 'img.png'")
# Convert from OpenCV's BGR channel order to RGB.
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# 2-D array holding three preset (x, y) prompt points.
input_point = np.array([[230, 194], [182, 63], [339, 279]])
# 1-D array, one entry per point: 1 marks foreground, 0 marks background.
input_label = np.array([1, 1, 0])

# Draw the image and overlay the prompt points on the current axes.
plt.figure(figsize=(10, 10))
plt.imshow(image)
show_points(input_point, input_label, plt.gca())
plt.axis('on')
plt.show()

这里图片可以用画图软件打开查看像素坐标辅助标定。

步骤三：标记点完成前景目标的分割

简单地调用源码模型，就能完成前景目标的分割。源码提供了三种不同大小的模型（vit_b、vit_l、vit_h），我们也可以自己去尝试不同模型的效果。

import numpy as np
import matplotlib.pyplot as plt
import cv2

# Display a mask image on the matplotlib axes ``ax``.
def show_mask(mask, ax, random_color=False):
    """Draw ``mask`` on ``ax`` as a semi-transparent RGBA overlay.

    With ``random_color`` set, the RGB part is drawn from
    ``np.random.random(3)``; otherwise a fixed blue (30, 144, 255) is used.
    The alpha component is 0.6 in both cases. The last two dimensions of
    ``mask`` are taken as height and width.
    """
    if not random_color:
        rgba = np.array([30 / 255, 144 / 255, 255 / 255, 0.6])
    else:
        alpha = np.array([0.6])
        rgba = np.concatenate([np.random.random(3), alpha], axis=0)
    height, width = mask.shape[-2:]
    # (h, w, 1) * (1, 1, 4) broadcasts to an (h, w, 4) RGBA image that is
    # all zeros wherever the mask is zero.
    overlay = mask.reshape(height, width, 1) * rgba.reshape(1, 1, -1)
    ax.imshow(overlay)

def show_points(coords, labels, ax, marker_size=375):
    """Scatter the prompt points on ``ax`` as star markers.

    Rows of ``coords`` whose entry in ``labels`` equals 1 (foreground) are
    drawn in green; rows whose label equals 0 (background) are drawn in red.
    Column 0 of ``coords`` is used as x and column 1 as y.
    """
    # Marker styling shared by both groups of points.
    star_style = dict(marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
    # Foreground is drawn first, then background.
    for flag, colour in ((1, 'green'), (0, 'red')):
        chosen = coords[labels == flag]
        ax.scatter(chosen[:, 0], chosen[:, 1], color=colour, **star_style)
import sys
sys.path.append("..")
import torch
from segment_anything import sam_model_registry, SamPredictor

# cv2.imread returns None instead of raising when the file cannot be read,
# so fail early with a clear error.
image = cv2.imread('img.png')
if image is None:
    raise FileNotFoundError("cv2.imread could not read 'img.png'")
# Convert from OpenCV's BGR channel order to RGB.
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# ------ load the model
# Path of the weight file (pick the one that matches model_type).
sam_checkpoint = "model_save/sam_vit_b_01ec64.pth"
# sam_checkpoint = "model_save/sam_vit_h_4b8939.pth"
# sam_checkpoint = "model_save/sam_vit_l_0b3195.pth"
# Model type.
model_type = "vit_b"
# model_type = "vit_h"
# model_type = "vit_l"
# Use the GPU when available; fall back to CPU so the script still runs on
# machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
sam.to(device=device)
predictor = SamPredictor(sam)
predictor.set_image(image)

# ------ model loaded

# Prompt points as (x, y) positions; several points may be given.
input_point = np.array([[230, 194], [182, 63], [339, 279]])
# 1 marks a foreground point, 0 a background point.
# input_point and input_label correspond one-to-one.
input_label = np.array([1, 1, 0])


masks, scores, logits = predictor.predict(
    point_coords=input_point,
    point_labels=input_label,
    multimask_output=True,
)
# One figure per returned mask, titled with its index and score.
for i, (mask, score) in enumerate(zip(masks, scores)):
    plt.figure(figsize=(10, 10))
    plt.imshow(image)
    show_mask(mask, plt.gca())
    show_points(input_point, input_label, plt.gca())
    plt.title(f"Mask {i + 1}, Score: {score:.3f}", fontsize=18)
    plt.axis('off')
    plt.show()

这里会输出三个结果（multimask_output=True 时，模型会返回多个候选掩膜及其对应的得分，代码逐个显示）。

步骤四：标定框完成前景目标的分割

绿色的框是用户自己标定的，根据框选的区域完成前景目标的分割。

import numpy as np
import matplotlib.pyplot as plt
import cv2

def show_mask(mask, ax, random_color=False):
    """Draw ``mask`` on ``ax`` as a semi-transparent RGBA overlay.

    With ``random_color`` set, the RGB part is drawn from
    ``np.random.random(3)``; otherwise a fixed blue (30, 144, 255) is used.
    The alpha component is 0.6 in both cases. The last two dimensions of
    ``mask`` are taken as height and width.
    """
    if not random_color:
        rgba = np.array([30 / 255, 144 / 255, 255 / 255, 0.6])
    else:
        alpha = np.array([0.6])
        rgba = np.concatenate([np.random.random(3), alpha], axis=0)
    height, width = mask.shape[-2:]
    # (h, w, 1) * (1, 1, 4) broadcasts to an (h, w, 4) RGBA image that is
    # all zeros wherever the mask is zero.
    overlay = mask.reshape(height, width, 1) * rgba.reshape(1, 1, -1)
    ax.imshow(overlay)

def show_points(coords, labels, ax, marker_size=375):
    """Scatter the prompt points on ``ax`` as star markers.

    Rows of ``coords`` whose entry in ``labels`` equals 1 (foreground) are
    drawn in green; rows whose label equals 0 (background) are drawn in red.
    Column 0 of ``coords`` is used as x and column 1 as y.
    """
    # Marker styling shared by both groups of points.
    star_style = dict(marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
    # Foreground is drawn first, then background.
    for flag, colour in ((1, 'green'), (0, 'red')):
        chosen = coords[labels == flag]
        ax.scatter(chosen[:, 0], chosen[:, 1], color=colour, **star_style)
def show_box(box, ax):
    """Outline ``box`` on ``ax`` with an unfilled green rectangle.

    ``box`` is indexed as ``[x0, y0, x1, y1]``: the first two entries are the
    starting corner and the last two the opposite corner.
    """
    left, top, right, bottom = box[0], box[1], box[2], box[3]
    # Rectangle takes an anchor corner plus width and height, not two corners.
    outline = plt.Rectangle(
        (left, top),
        right - left,
        bottom - top,
        edgecolor='green',
        facecolor=(0, 0, 0, 0),
        lw=2,
    )
    ax.add_patch(outline)

import sys
sys.path.append("..")
import torch
from segment_anything import sam_model_registry, SamPredictor

# cv2.imread returns None instead of raising when the file cannot be read,
# so fail early with a clear error.
image = cv2.imread('img.png')
if image is None:
    raise FileNotFoundError("cv2.imread could not read 'img.png'")
# Convert from OpenCV's BGR channel order to RGB.
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# ------ load the model
# Path of the weight file (pick the one that matches model_type).
sam_checkpoint = "model_save/sam_vit_b_01ec64.pth"
# sam_checkpoint = "model_save/sam_vit_h_4b8939.pth"
# sam_checkpoint = "model_save/sam_vit_l_0b3195.pth"
# Model type.
model_type = "vit_b"
# model_type = "vit_h"
# model_type = "vit_l"
# Use the GPU when available; fall back to CPU so the script still runs on
# machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
sam.to(device=device)
predictor = SamPredictor(sam)
predictor.set_image(image)

# ------ model loaded

# Start and end corner of the prompt box: [x0, y0, x1, y1].
input_box = np.array([112, 41, 373, 320])

# The box is passed with an added leading axis, i.e. with shape (1, 4).
masks, _, _ = predictor.predict(
    point_coords=None,
    point_labels=None,
    box=input_box[None, :],
    multimask_output=False,
)

# Overlay the first returned mask and the prompt box on the image.
plt.figure(figsize=(10, 10))
plt.imshow(image)
show_mask(masks[0], plt.gca())
show_box(input_box, plt.gca())
plt.axis('off')
plt.show()

步骤五：标定框和标记点联合完成前景目标的分割

对于一些复杂的目标，可能需要联合使用标定框和标记点来提高前景目标的分割精度。box和points可以联合标定完成图像分割，但是此时的box只能有一个，不能有多个。

import numpy as np
import matplotlib.pyplot as plt
import cv2

def show_mask(mask, ax, random_color=False):
    """Draw ``mask`` on ``ax`` as a semi-transparent RGBA overlay.

    With ``random_color`` set, the RGB part is drawn from
    ``np.random.random(3)``; otherwise a fixed blue (30, 144, 255) is used.
    The alpha component is 0.6 in both cases. The last two dimensions of
    ``mask`` are taken as height and width.
    """
    if not random_color:
        rgba = np.array([30 / 255, 144 / 255, 255 / 255, 0.6])
    else:
        alpha = np.array([0.6])
        rgba = np.concatenate([np.random.random(3), alpha], axis=0)
    height, width = mask.shape[-2:]
    # (h, w, 1) * (1, 1, 4) broadcasts to an (h, w, 4) RGBA image that is
    # all zeros wherever the mask is zero.
    overlay = mask.reshape(height, width, 1) * rgba.reshape(1, 1, -1)
    ax.imshow(overlay)

def show_points(coords, labels, ax, marker_size=375):
    """Scatter the prompt points on ``ax`` as star markers.

    Rows of ``coords`` whose entry in ``labels`` equals 1 (foreground) are
    drawn in green; rows whose label equals 0 (background) are drawn in red.
    Column 0 of ``coords`` is used as x and column 1 as y.
    """
    # Marker styling shared by both groups of points.
    star_style = dict(marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
    # Foreground is drawn first, then background.
    for flag, colour in ((1, 'green'), (0, 'red')):
        chosen = coords[labels == flag]
        ax.scatter(chosen[:, 0], chosen[:, 1], color=colour, **star_style)

def show_box(box, ax):
    """Outline ``box`` on ``ax`` with an unfilled green rectangle.

    ``box`` is indexed as ``[x0, y0, x1, y1]``: the first two entries are the
    starting corner and the last two the opposite corner.
    """
    left, top, right, bottom = box[0], box[1], box[2], box[3]
    # Rectangle takes an anchor corner plus width and height, not two corners.
    outline = plt.Rectangle(
        (left, top),
        right - left,
        bottom - top,
        edgecolor='green',
        facecolor=(0, 0, 0, 0),
        lw=2,
    )
    ax.add_patch(outline)

import sys
sys.path.append("..")
import torch
from segment_anything import sam_model_registry, SamPredictor

# cv2.imread returns None instead of raising when the file cannot be read,
# so fail early with a clear error.
image = cv2.imread('img.png')
if image is None:
    raise FileNotFoundError("cv2.imread could not read 'img.png'")
# Convert from OpenCV's BGR channel order to RGB.
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# ------ load the model
# Path of the weight file (pick the one that matches model_type).
sam_checkpoint = "model_save/sam_vit_b_01ec64.pth"
# sam_checkpoint = "model_save/sam_vit_h_4b8939.pth"
# sam_checkpoint = "model_save/sam_vit_l_0b3195.pth"
# Model type.
model_type = "vit_b"
# model_type = "vit_h"
# model_type = "vit_l"
# Use the GPU when available; fall back to CPU so the script still runs on
# machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
sam.to(device=device)
predictor = SamPredictor(sam)
predictor.set_image(image)
# ------ model loaded


# Start and end corner of the prompt box: [x0, y0, x1, y1].
input_box = np.array([112, 41, 373, 320])
# Prompt points as (x, y) positions; several points may be given.
input_point = np.array([[230, 194], [182, 63], [339, 279]])
# 1 marks a foreground point, 0 a background point.
# input_point and input_label correspond one-to-one.
input_label = np.array([1, 1, 0])

# Use the box and the points together as the prompt.
masks, _, _ = predictor.predict(
    point_coords=input_point,
    point_labels=input_label,
    box=input_box,
    multimask_output=False,
)

# Overlay the first returned mask, the box and the points on the image.
plt.figure(figsize=(10, 10))
plt.imshow(image)
show_mask(masks[0], plt.gca())
show_box(input_box, plt.gca())
show_points(input_point, input_label, plt.gca())
plt.axis('off')
plt.show()

步骤六：多标定框完成前景目标的分割

可以是多标定框对应多个目标，也可以是多标定框对应同一目标的不同部位。

import numpy as np
import matplotlib.pyplot as plt
import torch
import cv2

def show_mask(mask, ax, random_color=False):
    """Draw ``mask`` on ``ax`` as a semi-transparent RGBA overlay.

    With ``random_color`` set, the RGB part is drawn from
    ``np.random.random(3)``; otherwise a fixed blue (30, 144, 255) is used.
    The alpha component is 0.6 in both cases. The last two dimensions of
    ``mask`` are taken as height and width.
    """
    if not random_color:
        rgba = np.array([30 / 255, 144 / 255, 255 / 255, 0.6])
    else:
        alpha = np.array([0.6])
        rgba = np.concatenate([np.random.random(3), alpha], axis=0)
    height, width = mask.shape[-2:]
    # (h, w, 1) * (1, 1, 4) broadcasts to an (h, w, 4) RGBA image that is
    # all zeros wherever the mask is zero.
    overlay = mask.reshape(height, width, 1) * rgba.reshape(1, 1, -1)
    ax.imshow(overlay)

def show_points(coords, labels, ax, marker_size=375):
    """Scatter the prompt points on ``ax`` as star markers.

    Rows of ``coords`` whose entry in ``labels`` equals 1 (foreground) are
    drawn in green; rows whose label equals 0 (background) are drawn in red.
    Column 0 of ``coords`` is used as x and column 1 as y.
    """
    # Marker styling shared by both groups of points.
    star_style = dict(marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
    # Foreground is drawn first, then background.
    for flag, colour in ((1, 'green'), (0, 'red')):
        chosen = coords[labels == flag]
        ax.scatter(chosen[:, 0], chosen[:, 1], color=colour, **star_style)

def show_box(box, ax):
    """Outline ``box`` on ``ax`` with an unfilled green rectangle.

    ``box`` is indexed as ``[x0, y0, x1, y1]``: the first two entries are the
    starting corner and the last two the opposite corner.
    """
    left, top, right, bottom = box[0], box[1], box[2], box[3]
    # Rectangle takes an anchor corner plus width and height, not two corners.
    outline = plt.Rectangle(
        (left, top),
        right - left,
        bottom - top,
        edgecolor='green',
        facecolor=(0, 0, 0, 0),
        lw=2,
    )
    ax.add_patch(outline)

import sys
sys.path.append("..")
from segment_anything import sam_model_registry, SamPredictor

# cv2.imread returns None instead of raising when the file cannot be read,
# so fail early with a clear error.
image = cv2.imread('img.png')
if image is None:
    raise FileNotFoundError("cv2.imread could not read 'img.png'")
# Convert from OpenCV's BGR channel order to RGB.
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# ------ load the model
# Path of the weight file (pick the one that matches model_type).
sam_checkpoint = "model_save/sam_vit_b_01ec64.pth"
# sam_checkpoint = "model_save/sam_vit_h_4b8939.pth"
# sam_checkpoint = "model_save/sam_vit_l_0b3195.pth"
# Model type.
model_type = "vit_b"
# model_type = "vit_h"
# model_type = "vit_l"
# Use the GPU when available; fall back to CPU so the script still runs on
# machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
sam.to(device=device)
predictor = SamPredictor(sam)
predictor.set_image(image)
# ------ model loaded


# Several prompt boxes, one [x0, y0, x1, y1] row each, created on the
# predictor's device.
input_boxes = torch.tensor([
    [121, 49, 361, 190],
    [143, 101, 308, 312],
    [366, 116, 451, 233],
], device=predictor.device)

# Run the boxes through the predictor's own transform, given the original
# (height, width), before passing them to predict_torch.
transformed_boxes = predictor.transform.apply_boxes_torch(input_boxes, image.shape[:2])
masks, _, _ = predictor.predict_torch(
    point_coords=None,
    point_labels=None,
    boxes=transformed_boxes,
    multimask_output=False,
)
# Overlay every mask in its own random colour, then draw all boxes.
plt.figure(figsize=(10, 10))
plt.imshow(image)
for mask in masks:
    show_mask(mask.cpu().numpy(), plt.gca(), random_color=True)
for box in input_boxes:
    show_box(box.cpu().numpy(), plt.gca())
plt.axis('off')
plt.show()

步骤七：图片批量完成前景目标的分割

源码支持图片的批量输入，大大提升了分割效率。

import numpy as np
import matplotlib.pyplot as plt
import torch
import cv2

def show_mask(mask, ax, random_color=False):
    """Draw ``mask`` on ``ax`` as a semi-transparent RGBA overlay.

    With ``random_color`` set, the RGB part is drawn from
    ``np.random.random(3)``; otherwise a fixed blue (30, 144, 255) is used.
    The alpha component is 0.6 in both cases. The last two dimensions of
    ``mask`` are taken as height and width.
    """
    if not random_color:
        rgba = np.array([30 / 255, 144 / 255, 255 / 255, 0.6])
    else:
        alpha = np.array([0.6])
        rgba = np.concatenate([np.random.random(3), alpha], axis=0)
    height, width = mask.shape[-2:]
    # (h, w, 1) * (1, 1, 4) broadcasts to an (h, w, 4) RGBA image that is
    # all zeros wherever the mask is zero.
    overlay = mask.reshape(height, width, 1) * rgba.reshape(1, 1, -1)
    ax.imshow(overlay)

def show_points(coords, labels, ax, marker_size=375):
    """Scatter the prompt points on ``ax`` as star markers.

    Rows of ``coords`` whose entry in ``labels`` equals 1 (foreground) are
    drawn in green; rows whose label equals 0 (background) are drawn in red.
    Column 0 of ``coords`` is used as x and column 1 as y.
    """
    # Marker styling shared by both groups of points.
    star_style = dict(marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
    # Foreground is drawn first, then background.
    for flag, colour in ((1, 'green'), (0, 'red')):
        chosen = coords[labels == flag]
        ax.scatter(chosen[:, 0], chosen[:, 1], color=colour, **star_style)

def show_box(box, ax):
    """Outline ``box`` on ``ax`` with an unfilled green rectangle.

    ``box`` is indexed as ``[x0, y0, x1, y1]``: the first two entries are the
    starting corner and the last two the opposite corner.
    """
    left, top, right, bottom = box[0], box[1], box[2], box[3]
    # Rectangle takes an anchor corner plus width and height, not two corners.
    outline = plt.Rectangle(
        (left, top),
        right - left,
        bottom - top,
        edgecolor='green',
        facecolor=(0, 0, 0, 0),
        lw=2,
    )
    ax.add_patch(outline)

def prepare_image(image, transform, device):
    """Transform an image and return it as a contiguous channel-first tensor.

    Args:
        image: Image array accepted by ``transform.apply_image``; the result
            is permuted with ``(2, 0, 1)``, so it must be 3-dimensional.
        transform: Object exposing ``apply_image(image)``.
        device: Either an object with a ``.device`` attribute (the calling
            script passes the SAM model itself) or a plain device spec such
            as ``"cpu"``, ``"cuda"`` or a ``torch.device``.

    Returns:
        A contiguous tensor on the target device with the last axis of the
        transformed image moved to the front.
    """
    image = transform.apply_image(image)
    # Despite its name the parameter is usually a model; read its ``.device``
    # when present, otherwise treat the argument itself as the device.
    target = getattr(device, "device", device)
    image = torch.as_tensor(image, device=target)
    return image.permute(2, 0, 1).contiguous()


import sys
sys.path.append("..")
from segment_anything import sam_model_registry, SamPredictor
from segment_anything.utils.transforms import ResizeLongestSide

# cv2.imread returns None instead of raising when a file cannot be read,
# so fail early with a clear error for each input image.
image1 = cv2.imread('img.png')
if image1 is None:
    raise FileNotFoundError("cv2.imread could not read 'img.png'")
image1 = cv2.cvtColor(image1, cv2.COLOR_BGR2RGB)
image2 = cv2.imread('img_1.png')
if image2 is None:
    raise FileNotFoundError("cv2.imread could not read 'img_1.png'")
image2 = cv2.cvtColor(image2, cv2.COLOR_BGR2RGB)

# ------ load the model
# Path of the weight file (pick the one that matches model_type).
sam_checkpoint = "model_save/sam_vit_b_01ec64.pth"
# sam_checkpoint = "model_save/sam_vit_h_4b8939.pth"
# sam_checkpoint = "model_save/sam_vit_l_0b3195.pth"
# Model type.
model_type = "vit_b"
# model_type = "vit_h"
# model_type = "vit_l"
# Use the GPU when available; fall back to CPU so the script still runs on
# machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
sam.to(device=device)
# Resize transform configured with the image encoder's img_size.
resize_transform = ResizeLongestSide(sam.image_encoder.img_size)
# ------ model loaded

# Several prompt boxes per image, one [x0, y0, x1, y1] row each.
image1_boxes = torch.tensor([
    [121, 49, 361, 190],
    [143, 101, 308, 312],
    [366, 116, 451, 233],
], device=sam.device)

image2_boxes = torch.tensor([
    [24, 4, 333, 265],
], device=sam.device)

# Batched input: one dict per image with the prepared image tensor, the
# transformed boxes and the original (height, width).
batched_input = [
    {
        'image': prepare_image(image1, resize_transform, sam),
        'boxes': resize_transform.apply_boxes_torch(image1_boxes, image1.shape[:2]),
        'original_size': image1.shape[:2]
    },
    {
        'image': prepare_image(image2, resize_transform, sam),
        'boxes': resize_transform.apply_boxes_torch(image2_boxes, image2.shape[:2]),
        'original_size': image2.shape[:2]
    }
]
batched_output = sam(batched_input, multimask_output=False)

fig, ax = plt.subplots(1, 2, figsize=(20, 20))

# Batched output: draw each image with its masks and boxes in its own panel.
panels = zip(ax, (image1, image2), (image1_boxes, image2_boxes), batched_output)
for axis, panel_image, panel_boxes, panel_output in panels:
    axis.imshow(panel_image)
    for mask in panel_output['masks']:
        show_mask(mask.cpu().numpy(), axis, random_color=True)
    for box in panel_boxes:
        show_box(box.cpu().numpy(), axis)
    axis.axis('off')
plt.tight_layout()
plt.show()

2.automatic_mask_generator_example

源码在notebooks文件内提供了一个Jupyter Notebook的自动分割教程，无需标定点和标定框。

步骤一：自动掩码生成

import numpy as np
import torch
import matplotlib.pyplot as plt
import cv2

# cv2.imread returns None instead of raising when the file cannot be read,
# so fail early with a clear error.
image = cv2.imread('img.png')
if image is None:
    raise FileNotFoundError("cv2.imread could not read 'img.png'")
# Convert from OpenCV's BGR channel order to RGB.
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Path of the weight file (pick the one that matches model_type).
sam_checkpoint = "model_save/sam_vit_b_01ec64.pth"
# sam_checkpoint = "model_save/sam_vit_h_4b8939.pth"
# sam_checkpoint = "model_save/sam_vit_l_0b3195.pth"
# Model type.
model_type = "vit_b"
# model_type = "vit_h"
# model_type = "vit_l"
# Use the GPU when available; fall back to CPU so the script still runs on
# machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

def show_anns(anns):
    """Overlay every annotation mask on the current matplotlib axes.

    Args:
        anns: Sequence of dicts, each with an ``'area'`` entry used for
            sorting and a 2-D ``'segmentation'`` mask array. Nothing is
            drawn when the sequence is empty.
    """
    if not anns:
        return
    # Largest area first, so smaller masks are drawn later, on top.
    sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
    ax = plt.gca()
    ax.set_autoscale_on(False)
    for ann in sorted_anns:
        m = ann['segmentation']
        # Solid image in one random RGB colour; broadcasting fills all pixels.
        img = np.ones((m.shape[0], m.shape[1], 3)) * np.random.random(3)
        # Append the mask scaled by 0.35 as the alpha channel (depth-wise
        # stack), so only masked pixels are tinted.
        ax.imshow(np.dstack((img, m * 0.35)))

from segment_anything import sam_model_registry, SamAutomaticMaskGenerator, SamPredictor
# Instantiate the SAM variant selected by model_type, load its weights from
# the checkpoint and move it to the chosen device.
sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
sam.to(device=device)

# Automatic mask generator with its default parameters; no prompt points or
# boxes are supplied.
mask_generator = SamAutomaticMaskGenerator(sam)
masks = mask_generator.generate(image)

# Show the image and overlay all generated masks.
plt.figure(figsize=(20, 20))
plt.imshow(image)
show_anns(masks)
plt.axis('off')
plt.show()

在自动掩模生成中有几个可调参数，用于控制采样点的密度以及去除低质量或重复掩模的阈值。此外，生成可以在图像的裁剪上自动运行，以提高较小对象的性能，后处理可以去除杂散像素和孔洞。

import numpy as np
import torch
import matplotlib.pyplot as plt
import cv2

# Read the image file. cv2.imread returns None instead of raising when the
# file cannot be read, so fail early with a clear error.
image = cv2.imread('img.png')
if image is None:
    raise FileNotFoundError("cv2.imread could not read 'img.png'")
# Convert the image from OpenCV's default BGR format to RGB.
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Path of the weight file (pick the one that matches model_type).
sam_checkpoint = "model_save/sam_vit_b_01ec64.pth"
# sam_checkpoint = "model_save/sam_vit_h_4b8939.pth"
# sam_checkpoint = "model_save/sam_vit_l_0b3195.pth"

# Model type.
model_type = "vit_b"
# model_type = "vit_h"
# model_type = "vit_l"

# Use the GPU when available; fall back to CPU so the script still runs on
# machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Mask display function.
def show_anns(anns):
    """Overlay every annotation mask on the current matplotlib axes.

    Args:
        anns: Sequence of dicts, each with an ``'area'`` entry used for
            sorting and a 2-D ``'segmentation'`` mask array. Nothing is
            drawn when the sequence is empty.
    """
    # Nothing to draw for an empty annotation list.
    if not anns:
        return
    # Sort by area in descending order so larger objects are drawn first and
    # smaller ones end up on top instead of being covered.
    sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
    # Draw on the current axes and keep its limits fixed while overlaying.
    ax = plt.gca()
    ax.set_autoscale_on(False)
    for ann in sorted_anns:
        # Mask of this annotation.
        m = ann['segmentation']
        # Solid image in one random RGB colour; broadcasting fills all pixels.
        img = np.ones((m.shape[0], m.shape[1], 3)) * np.random.random(3)
        # Append the mask scaled by 0.35 as the alpha channel (depth-wise
        # stack), so only masked pixels are tinted, then layer it on the axes.
        ax.imshow(np.dstack((img, m * 0.35)))

from segment_anything import sam_model_registry, SamAutomaticMaskGenerator, SamPredictor
# Instantiate the SAM variant selected by model_type and load its weights
# from the given checkpoint, then move it to the chosen device.
sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
sam.to(device=device)

# Default-parameter version (kept for reference):
# mask_generator = SamAutomaticMaskGenerator(sam)
# Custom-parameter version (used below).

# Build the mask generator with explicit tuning parameters.
mask_generator_2 = SamAutomaticMaskGenerator(
    model=sam,
    points_per_side=32,
    pred_iou_thresh=0.86,
    stability_score_thresh=0.92,
    crop_n_layers=1,
    crop_n_points_downscale_factor=2,
    min_mask_region_area=100,  # Requires open-cv to run post-processing
)

masks = mask_generator_2.generate(image)
# Visualise the masks on top of the image.
plt.figure(figsize=(20, 20))
plt.imshow(image)
show_anns(masks)
plt.axis('off')
plt.show()