overlayfs

OverlayFS is a modern union filesystem that is similar to AUFS, but faster and with a simpler implementation. Docker provides two storage drivers for OverlayFS: the original overlay, and the newer and more stable overlay2.

预备知识

https://docs.docker.com/storage/storagedriver/overlayfs-driver/

overlay vs overlay2

If you are still using the overlay driver rather than overlay2, see How the overlay driver works instead.

OverlayFS layers two directories on a single Linux host and presents them as a single directory. These directories are called layers and the unification process is referred to as a union mount. OverlayFS refers to the lower directory as lowerdir and the upper directory a upperdir. The unified view is exposed through its own directory called merged.

While the overlay driver only works with a single lower OverlayFS layer and hence requires hard links for implementation of multi-layered images, the overlay2 driver natively supports up to 128 lower OverlayFS layers. This capability provides better performance for layer-related Docker commands such as docker build and docker commit, and consumes fewer inodes on the backing filesystem.

overlay1存在的问题:

While the overlay driver only works with a single lower OverlayFS layer and hence requires hard links for implementation of multi-layered images, the overlay2 driver natively supports up to 128 lower OverlayFS layers. This capability provides better performance for layer-related Docker commands such as docker build and docker commit, and consumes fewer inodes on the backing filesystem.

如果还是在同一个文件系统的时候,比如pull镜像的时候是可以使用hard-link的方式来引用低层的镜像层的数据的。

To create a container, the overlay driver combines the directory representing the image’s top layer plus a new directory for the container. The image’s top layer is the lowerdir in the overlay and is read-only. The new directory for the container is the upperdir and is writable.

moby overlay的代码:moby/moby/daemon/graphdriver/overlay

  • Create()方法也就是产生rootfs的过程中都是copy.DirCopy(parentUpperDir, upperDir, copy.Content, true)也就是拷贝的内容(将parentUpperDir的内容拷贝到upperDir,因为只支持一层lowerDir,同时不能将parentUpperDir hard-link到upperDir目录,因为upper层是可读写的,hard-link也会破坏原有的parent)
  • ApplyDiff()也就是构建镜像过程中调用的函数都是使用的hard-link的方式copy.DirCopy(parentRootDir, tmpRootDir, copy.Hardlink, true)

docker commit过程需要Create() -> Diff() -> ApplyDiff()的过程,因为Create()会拷贝文件内容,因此会消耗多余的inode。

具体实现

func (o *snapshotter) Prepare(ctx context.Context, key, parent string, opts ...snapshots.Opt) ([]mount.Mount, error) {
    return o.createSnapshot(ctx, snapshots.KindActive, key, parent, opts)
}

func (o *snapshotter) createSnapshot(ctx context.Context, kind snapshots.Kind, key, parent string, opts []snapshots.Opt) ([]mount.Mount, error) {
    ctx, t, err := o.ms.TransactionContext(ctx, true)
    if err != nil {
        return nil, err
    }

    var td, path string
    defer func() {
        if err != nil {
            if td != "" {
                if err1 := os.RemoveAll(td); err1 != nil {
                    log.G(ctx).WithError(err1).Warn("failed to cleanup temp snapshot directory")
                }
            }
            if path != "" {
                if err1 := os.RemoveAll(path); err1 != nil {
                    log.G(ctx).WithError(err1).WithField("path", path).Error("failed to reclaim snapshot directory, directory may need removal")
                    err = errors.Wrapf(err, "failed to remove path: %v", err1)
                }
            }
        }
    }()

    snapshotDir := filepath.Join(o.root, "snapshots")
    td, err = o.prepareDirectory(ctx, snapshotDir, kind)
    if err != nil {
        if rerr := t.Rollback(); rerr != nil {
            log.G(ctx).WithError(rerr).Warn("failed to rollback transaction")
        }
        return nil, errors.Wrap(err, "failed to create prepare snapshot dir")
    }
    rollback := true
    defer func() {
        if rollback {
            if rerr := t.Rollback(); rerr != nil {
                log.G(ctx).WithError(rerr).Warn("failed to rollback transaction")
            }
        }
    }()

    s, err := storage.CreateSnapshot(ctx, kind, key, parent, opts...)
    if err != nil {
        return nil, errors.Wrap(err, "failed to create snapshot")
    }

    if len(s.ParentIDs) > 0 {
        st, err := os.Stat(o.upperPath(s.ParentIDs[0]))
        if err != nil {
            return nil, errors.Wrap(err, "failed to stat parent")
        }

        stat := st.Sys().(*syscall.Stat_t)

        // 设置目录权限和parent的相同
        if err := os.Lchown(filepath.Join(td, "fs"), int(stat.Uid), int(stat.Gid)); err != nil {
            if rerr := t.Rollback(); rerr != nil {
                log.G(ctx).WithError(rerr).Warn("failed to rollback transaction")
            }
            return nil, errors.Wrap(err, "failed to chown")
        }
    }

    path = filepath.Join(snapshotDir, s.ID)
    if err = os.Rename(td, path); err != nil {
        return nil, errors.Wrap(err, "failed to rename")
    }
    td = ""

    // 这里不太理解为什么rollback=false? 
    rollback = false
    if err = t.Commit(); err != nil {
        return nil, errors.Wrap(err, "commit failed")
    }

    // merge目录不由snapshotter决定,挂载在哪,哪就是merge层
    return o.mounts(s), nil
}

func (o *snapshotter) mounts(s storage.Snapshot) []mount.Mount {
    if len(s.ParentIDs) == 0 {
        // if we only have one layer/no parents then just return a bind mount as overlay
        // will not work
        roFlag := "rw"
        if s.Kind == snapshots.KindView {
            roFlag = "ro"
        }

        return []mount.Mount{
            {
                Source: o.upperPath(s.ID),
                Type:   "bind",
                Options: []string{
                    roFlag,
                    "rbind",
                },
            },
        }
    }
    var options []string

    if s.Kind == snapshots.KindActive {
        options = append(options,
            // filepath.Join(o.root, "snapshots", id, "work")
            // 文件系统挂载后用于存放临时和间接文件的工作基目录
            fmt.Sprintf("workdir=%s", o.workPath(s.ID)),
            // upper目录,也就是文件系统存储的主目录,可以认为是container的RW层
            // filepath.Join(o.root, "snapshots", id, "fs")
            fmt.Sprintf("upperdir=%s", o.upperPath(s.ID)),
        )
    } else if len(s.ParentIDs) == 1 {
        // 只有一层且只是返回可读层的时候上边注释也有说明,直接返回父亲的bind mount即可
        return []mount.Mount{
            {
                Source: o.upperPath(s.ParentIDs[0]),
                Type:   "bind",
                Options: []string{
                    "ro",
                    "rbind",
                },
            },
        }
    }

    // 不论是返回读写层都需要把lowerdir给放到options中去
    parentPaths := make([]string, len(s.ParentIDs))
    for i := range s.ParentIDs {
        parentPaths[i] = o.upperPath(s.ParentIDs[i])
    }

    options = append(options, fmt.Sprintf("lowerdir=%s", strings.Join(parentPaths, ":")))
    // 具体如何处理由mount包处理,参考containerd/containerd/sys/mount_linux.go
    return []mount.Mount{
        {
            Type:    "overlay",
            Source:  "overlay",
            Options: options,
        },
    }

}

// 和native的一样的,没什么好说的
func (o *snapshotter) Commit(ctx context.Context, name, key string, opts ...snapshots.Opt) error {
    ctx, t, err := o.ms.TransactionContext(ctx, true)
    if err != nil {
        return err
    }

    defer func() {
        if err != nil {
            if rerr := t.Rollback(); rerr != nil {
                log.G(ctx).WithError(rerr).Warn("failed to rollback transaction")
            }
        }
    }()

    // grab the existing id
    id, _, _, err := storage.GetInfo(ctx, key)
    if err != nil {
        return err
    }

    usage, err := fs.DiskUsage(ctx, o.upperPath(id))
    if err != nil {
        return err
    }

    if _, err = storage.CommitActive(ctx, key, name, snapshots.Usage(usage), opts...); err != nil {
        return errors.Wrap(err, "failed to commit snapshot")
    }
    return t.Commit()
}

results matching ""

    No results matching ""