hcloud-upload-image/hcloudimages/client.go

551 lines
17 KiB
Go
Raw Normal View History

2024-05-02 21:42:36 +02:00
package hcloudimages
2024-04-29 21:00:04 +02:00
import (
	"context"
	"errors"
	"fmt"
	"io"
	"log/slog"
	"net/url"
	"strings"
	"time"

	"github.com/hetznercloud/hcloud-go/v2/hcloud"
	"github.com/hetznercloud/hcloud-go/v2/hcloud/exp/kit/sshutil"
	"golang.org/x/crypto/ssh"

	"github.com/apricote/hcloud-upload-image/hcloudimages/contextlogger"
	"github.com/apricote/hcloud-upload-image/hcloudimages/internal/actionutil"
	"github.com/apricote/hcloud-upload-image/hcloudimages/internal/control"
	"github.com/apricote/hcloud-upload-image/hcloudimages/internal/labelutil"
	"github.com/apricote/hcloud-upload-image/hcloudimages/internal/randomid"
	"github.com/apricote/hcloud-upload-image/hcloudimages/internal/sshsession"
)
const (
// CreatedByLabel is the label key set on every resource created by this package (see [DefaultLabels]).
CreatedByLabel = "apricote.de/created-by"
// CreatedByValue is the value stored under [CreatedByLabel].
CreatedByValue = "hcloud-upload-image"
// resourcePrefix is prepended to the random run ID to build the name shared by the temporary SSH key and server.
resourcePrefix = "hcloud-upload-image-"
)
var (
DefaultLabels = map[string]string{
CreatedByLabel: CreatedByValue,
}
serverTypePerArchitecture = map[hcloud.Architecture]*hcloud.ServerType{
hcloud.ArchitectureX86: {Name: "cx22"},
2024-04-29 21:00:04 +02:00
hcloud.ArchitectureARM: {Name: "cax11"},
}
defaultImage = &hcloud.Image{Name: "ubuntu-24.04"}
2024-04-29 21:00:04 +02:00
defaultLocation = &hcloud.Location{Name: "fsn1"}
defaultRescueType = hcloud.ServerRescueTypeLinux64
defaultSSHDialTimeout = 1 * time.Minute
// Size observed on x86, 2025-05-03, no idea if that changes.
// Might be able to extends this to more of the available memory.
rescueSystemRootDiskSizeMB int64 = 960
2024-04-29 21:00:04 +02:00
)
// UploadOptions configures a single call to [Client.Upload].
type UploadOptions struct {
// ImageURL must be publicly available. The instance will download the image from this endpoint.
ImageURL *url.URL
// ImageReader is handed to the SSH session that runs the write command on the temporary server (presumably as
// its stdin). If ImageURL is nil, no download step is added to that command, so the image data has to come from
// this reader.
ImageReader io.Reader
// ImageCompression describes the compression of the referenced image file. It defaults to [CompressionNone]. If
// set to anything else, the file will be decompressed before written to the disk.
ImageCompression Compression
// ImageFormat describes the format of the image file. It defaults to [FormatRaw].
ImageFormat Format
// ImageSize is the size of the image in bytes.
// Can be optionally set to make the client validate that the image can be written to the server.
ImageSize int64
// Possible future additions:
// ImageSignatureVerification
// ImageLocalPath
// Architecture should match the architecture of the Image. This decides if the Snapshot can later be
// used with [hcloud.ArchitectureX86] or [hcloud.ArchitectureARM] servers.
//
// Internally this decides what server type is used for the temporary server.
//
// Optional if [UploadOptions.ServerType] is set.
Architecture hcloud.Architecture
// ServerType can be optionally set to override the default server type for the architecture.
// Situations where this makes sense:
//
// - Your image is larger than the root disk of the default server types.
// - The default server type is no longer available, or is temporarily out of stock.
ServerType *hcloud.ServerType
// Description is an optional description that the resulting image (snapshot) will have. There is no way to
// select images by their description, you should use Labels if you need to identify your image later.
Description *string
// Labels will be added to the resulting image (snapshot). Use these to filter the image list if you
// need to identify the image later on.
//
// We also always add a label `apricote.de/created-by=hcloud-upload-image` ([CreatedByLabel], [CreatedByValue]).
Labels map[string]string
// DebugSkipResourceCleanup will skip the cleanup of the temporary SSH Key and Server.
DebugSkipResourceCleanup bool
}
// Compression names the compression applied to the image file, see [UploadOptions.ImageCompression].
type Compression string
const (
// CompressionNone means the image is not compressed. This is the zero value.
CompressionNone Compression = ""
// CompressionBZ2 marks a bzip2 compressed image, it is decompressed with "bzip2 -cd" on the temporary server.
CompressionBZ2 Compression = "bz2"
// CompressionXZ marks an xz compressed image, it is decompressed with "xz -cd" on the temporary server.
CompressionXZ Compression = "xz"
// CompressionZSTD marks a zstd compressed image, it is decompressed with "zstd -cd" on the temporary server.
CompressionZSTD Compression = "zstd"
// Possible future additions:
// zip
)
// Format names the disk image format of the image file, see [UploadOptions.ImageFormat].
type Format string
const (
// FormatRaw is a raw disk image that is written to the disk as-is. This is the zero value.
FormatRaw Format = ""
// FormatQCOW2 allows uploading images in the qcow2 format directly.
//
// The qcow2 image must fit on the disk available in the rescue system. "qemu-img dd", which is used to convert
// qcow2 to raw, requires a file as an input. If [UploadOptions.ImageSize] is set and FormatQCOW2 is used, there is a
// warning message displayed if there is a high probability of issues.
FormatQCOW2 Format = "qcow2"
)
// NewClient instantiates a new client. It requires a working [*hcloud.Client] to interact with the Hetzner Cloud API.
func NewClient(c *hcloud.Client) *Client {
return &Client{
2024-05-02 21:42:36 +02:00
c: c,
2024-04-29 21:00:04 +02:00
}
}
type Client struct {
2024-05-02 21:42:36 +02:00
c *hcloud.Client
2024-04-29 21:00:04 +02:00
}
// Upload the specified image into a snapshot on Hetzner Cloud.
//
// As the Hetzner Cloud API has no direct way to upload images, we create a temporary server,
// overwrite the root disk and take a snapshot of that disk instead.
//
// The temporary server costs money. If the upload fails, we might be unable to delete the server. Check out
// CleanupTempResources for a helper in this case.
func (s *Client) Upload(ctx context.Context, options UploadOptions) (*hcloud.Image, error) {
2024-04-30 23:48:59 +02:00
logger := contextlogger.From(ctx).With(
"library", "hcloudimages",
2024-04-30 23:48:59 +02:00
"method", "upload",
)
2024-04-29 21:00:04 +02:00
id, err := randomid.Generate()
if err != nil {
return nil, err
}
2024-04-30 23:48:59 +02:00
logger = logger.With("run-id", id)
2024-04-29 21:00:04 +02:00
// For simplicity, we use the name random name for SSH Key + Server
resourceName := resourcePrefix + id
labels := labelutil.Merge(DefaultLabels, options.Labels)
2024-04-29 21:00:04 +02:00
// 0. Validations
if options.ImageFormat == FormatQCOW2 && options.ImageSize > 0 {
if options.ImageSize > rescueSystemRootDiskSizeMB*1024*1024 {
// Just a warning, because the size might change with time.
// Alternatively one could add an override flag for the check and make this an error.
logger.WarnContext(ctx,
fmt.Sprintf("image must be smaller than %d MB (rescue system root disk) for qcow2", rescueSystemRootDiskSizeMB),
"maximum-size", rescueSystemRootDiskSizeMB,
"actual-size", options.ImageSize/(1024*1024),
)
}
}
2024-04-29 21:00:04 +02:00
// 1. Create SSH Key
2024-04-30 23:48:59 +02:00
logger.InfoContext(ctx, "# Step 1: Generating SSH Key")
privateKey, publicKey, err := sshutil.GenerateKeyPair()
2024-04-29 21:00:04 +02:00
if err != nil {
return nil, fmt.Errorf("failed to generate temporary ssh key pair: %w", err)
}
2024-05-02 21:42:36 +02:00
key, _, err := s.c.SSHKey.Create(ctx, hcloud.SSHKeyCreateOpts{
2024-04-29 21:00:04 +02:00
Name: resourceName,
PublicKey: string(publicKey),
Labels: labels,
2024-04-29 21:00:04 +02:00
})
if err != nil {
return nil, fmt.Errorf("failed to submit temporary ssh key to API: %w", err)
}
2024-04-30 23:48:59 +02:00
logger.DebugContext(ctx, "Uploaded ssh key", "ssh-key-id", key.ID)
2024-04-29 21:00:04 +02:00
defer func() {
// Cleanup SSH Key
if options.DebugSkipResourceCleanup {
2024-04-30 23:48:59 +02:00
logger.InfoContext(ctx, "Cleanup: Skipping cleanup of temporary ssh key")
2024-04-29 21:00:04 +02:00
return
}
2024-04-30 23:48:59 +02:00
logger.InfoContext(ctx, "Cleanup: Deleting temporary ssh key")
2024-05-02 21:42:36 +02:00
_, err := s.c.SSHKey.Delete(ctx, key)
2024-04-29 21:00:04 +02:00
if err != nil {
2024-04-30 23:48:59 +02:00
logger.WarnContext(ctx, "Cleanup: ssh key could not be deleted", "error", err)
2024-04-29 21:00:04 +02:00
// TODO
}
}()
// 2. Create Server
2024-04-30 23:48:59 +02:00
logger.InfoContext(ctx, "# Step 2: Creating Server")
var serverType *hcloud.ServerType
if options.ServerType != nil {
serverType = options.ServerType
} else {
var ok bool
serverType, ok = serverTypePerArchitecture[options.Architecture]
if !ok {
return nil, fmt.Errorf("unknown architecture %q, valid options: %q, %q", options.Architecture, hcloud.ArchitectureX86, hcloud.ArchitectureARM)
}
2024-04-29 21:00:04 +02:00
}
2024-04-30 23:48:59 +02:00
logger.DebugContext(ctx, "creating server with config",
"image", defaultImage.Name,
"location", defaultLocation.Name,
"serverType", serverType.Name,
)
2024-05-02 21:42:36 +02:00
serverCreateResult, _, err := s.c.Server.Create(ctx, hcloud.ServerCreateOpts{
2024-04-29 21:00:04 +02:00
Name: resourceName,
ServerType: serverType,
// Not used, but without this the user receives an email with a password for every created server
SSHKeys: []*hcloud.SSHKey{key},
// We need to enable rescue system first
StartAfterCreate: hcloud.Ptr(false),
// Image will never be booted, we only boot into rescue system
Image: defaultImage,
Location: defaultLocation,
Labels: labels,
2024-04-29 21:00:04 +02:00
})
if err != nil {
return nil, fmt.Errorf("creating the temporary server failed: %w", err)
}
2024-04-30 23:48:59 +02:00
logger = logger.With("server", serverCreateResult.Server.ID)
logger.DebugContext(ctx, "Created Server")
2024-04-29 21:00:04 +02:00
2024-04-30 23:48:59 +02:00
logger.DebugContext(ctx, "waiting on actions")
2024-05-02 21:42:36 +02:00
err = s.c.Action.WaitFor(ctx, append(serverCreateResult.NextActions, serverCreateResult.Action)...)
2024-04-29 21:00:04 +02:00
if err != nil {
return nil, fmt.Errorf("creating the temporary server failed: %w", err)
}
2024-04-30 23:48:59 +02:00
logger.DebugContext(ctx, "actions finished")
2024-04-29 21:00:04 +02:00
server := serverCreateResult.Server
defer func() {
// Cleanup Server
if options.DebugSkipResourceCleanup {
2024-04-30 23:48:59 +02:00
logger.InfoContext(ctx, "Cleanup: Skipping cleanup of temporary server")
2024-04-29 21:00:04 +02:00
return
}
2024-04-30 23:48:59 +02:00
logger.InfoContext(ctx, "Cleanup: Deleting temporary server")
2024-05-02 21:42:36 +02:00
_, _, err := s.c.Server.DeleteWithResult(ctx, server)
2024-04-29 21:00:04 +02:00
if err != nil {
2024-04-30 23:48:59 +02:00
logger.WarnContext(ctx, "Cleanup: server could not be deleted", "error", err)
2024-04-29 21:00:04 +02:00
}
}()
// 3. Activate Rescue System
logger.InfoContext(ctx, "# Step 3: Activating Rescue System")
2024-05-02 21:42:36 +02:00
enableRescueResult, _, err := s.c.Server.EnableRescue(ctx, server, hcloud.ServerEnableRescueOpts{
2024-04-29 21:00:04 +02:00
Type: defaultRescueType,
SSHKeys: []*hcloud.SSHKey{key},
})
if err != nil {
return nil, fmt.Errorf("enabling the rescue system on the temporary server failed: %w", err)
}
2024-04-30 23:48:59 +02:00
logger.DebugContext(ctx, "rescue system requested, waiting on action")
2024-05-02 21:42:36 +02:00
err = s.c.Action.WaitFor(ctx, enableRescueResult.Action)
2024-04-29 21:00:04 +02:00
if err != nil {
return nil, fmt.Errorf("enabling the rescue system on the temporary server failed: %w", err)
}
2024-04-30 23:48:59 +02:00
logger.DebugContext(ctx, "action finished, rescue system enabled")
2024-04-29 21:00:04 +02:00
// 4. Boot Server
2024-04-30 23:48:59 +02:00
logger.InfoContext(ctx, "# Step 4: Booting Server")
2024-05-02 21:42:36 +02:00
powerOnAction, _, err := s.c.Server.Poweron(ctx, server)
2024-04-29 21:00:04 +02:00
if err != nil {
return nil, fmt.Errorf("starting the temporary server failed: %w", err)
}
2024-04-30 23:48:59 +02:00
logger.DebugContext(ctx, "boot requested, waiting on action")
2024-05-02 21:42:36 +02:00
err = s.c.Action.WaitFor(ctx, powerOnAction)
2024-04-29 21:00:04 +02:00
if err != nil {
return nil, fmt.Errorf("starting the temporary server failed: %w", err)
}
2024-04-30 23:48:59 +02:00
logger.DebugContext(ctx, "action finished, server is booting")
2024-04-29 21:00:04 +02:00
// 5. Open SSH Session
2024-04-30 23:48:59 +02:00
logger.InfoContext(ctx, "# Step 5: Opening SSH Connection")
2024-04-29 21:00:04 +02:00
signer, err := ssh.ParsePrivateKey(privateKey)
if err != nil {
return nil, fmt.Errorf("parsing the automatically generated temporary private key failed: %w", err)
}
sshClientConfig := &ssh.ClientConfig{
User: "root",
Auth: []ssh.AuthMethod{
ssh.PublicKeys(signer),
},
// There is no way to get the host key of the rescue system beforehand
HostKeyCallback: ssh.InsecureIgnoreHostKey(),
Timeout: defaultSSHDialTimeout,
}
// the server needs some time until its properly started and ssh is available
var sshClient *ssh.Client
2024-04-30 23:48:59 +02:00
err = control.Retry(
contextlogger.New(ctx, logger.With("operation", "ssh")),
100, // ~ 3 minutes
2024-04-30 23:48:59 +02:00
func() error {
var err error
logger.DebugContext(ctx, "trying to connect to server", "ip", server.PublicNet.IPv4.IP)
sshClient, err = ssh.Dial("tcp", server.PublicNet.IPv4.IP.String()+":ssh", sshClientConfig)
return err
},
)
2024-04-29 21:00:04 +02:00
if err != nil {
return nil, fmt.Errorf("failed to ssh into temporary server: %w", err)
}
defer func() { _ = sshClient.Close() }()
2024-04-29 21:00:04 +02:00
// 6. Wipe existing disk, to avoid storing any bytes from it in the snapshot
logger.InfoContext(ctx, "# Step 6: Cleaning existing disk")
output, err := sshsession.Run(sshClient, "blkdiscard /dev/sda", nil)
logger.DebugContext(ctx, string(output))
if err != nil {
return nil, fmt.Errorf("failed to clean existing disk: %w", err)
}
// 7. SSH On Server: Download Image, Decompress, Write to Root Disk
logger.InfoContext(ctx, "# Step 7: Downloading image and writing to disk")
cmd, err := assembleCommand(options)
if err != nil {
return nil, err
}
logger.DebugContext(ctx, "running download, decompress and write to disk command", "cmd", cmd)
2024-04-30 23:48:59 +02:00
output, err = sshsession.Run(sshClient, cmd, options.ImageReader)
logger.InfoContext(ctx, "# Step 7: Finished writing image to disk")
2024-04-30 23:48:59 +02:00
logger.DebugContext(ctx, string(output))
2024-04-29 21:00:04 +02:00
if err != nil {
return nil, fmt.Errorf("failed to download and write the image: %w", err)
}
// 8. SSH On Server: Shutdown
logger.InfoContext(ctx, "# Step 8: Shutting down server")
_, err = sshsession.Run(sshClient, "shutdown now", nil)
2024-04-29 21:00:04 +02:00
if err != nil {
// TODO Verify if shutdown error, otherwise return
2024-04-30 23:48:59 +02:00
logger.WarnContext(ctx, "shutdown returned error", "err", err)
2024-04-29 21:00:04 +02:00
}
// 9. Create Image from Server
logger.InfoContext(ctx, "# Step 9: Creating Image")
2024-05-02 21:42:36 +02:00
createImageResult, _, err := s.c.Server.CreateImage(ctx, server, &hcloud.ServerCreateImageOpts{
2024-04-29 21:00:04 +02:00
Type: hcloud.ImageTypeSnapshot,
Description: options.Description,
Labels: labels,
2024-04-29 21:00:04 +02:00
})
if err != nil {
return nil, fmt.Errorf("failed to create snapshot: %w", err)
}
2024-04-30 23:48:59 +02:00
logger.DebugContext(ctx, "image creation requested, waiting on action")
2024-05-02 21:42:36 +02:00
err = s.c.Action.WaitFor(ctx, createImageResult.Action)
2024-04-29 21:00:04 +02:00
if err != nil {
return nil, fmt.Errorf("failed to create snapshot: %w", err)
}
2024-04-30 23:48:59 +02:00
logger.DebugContext(ctx, "action finished, image was created")
2024-04-29 21:00:04 +02:00
image := createImageResult.Image
2024-04-30 23:48:59 +02:00
logger.InfoContext(ctx, "# Image was created", "image", image.ID)
2024-04-29 21:00:04 +02:00
// Resource cleanup is happening in `defer`
return image, nil
}
// CleanupTempResources tries to delete any resources that were left over from previous calls to [Client.Upload].
// Upload tries to clean up any temporary resources it created at runtime, but might fail at any point.
// You can then use this command to make sure that all temporary resources are removed from your project.
//
// This method tries to delete any server or ssh keys that match the [DefaultLabels]
func (s *Client) CleanupTempResources(ctx context.Context) error {
logger := contextlogger.From(ctx).With(
"library", "hcloudimages",
"method", "cleanup",
)
selector := labelutil.Selector(DefaultLabels)
logger = logger.With("selector", selector)
logger.InfoContext(ctx, "# Cleaning up Servers")
err := s.cleanupTempServers(ctx, logger, selector)
if err != nil {
return fmt.Errorf("failed to clean up all servers: %w", err)
2024-04-29 21:00:04 +02:00
}
logger.DebugContext(ctx, "cleaned up all servers")
logger.InfoContext(ctx, "# Cleaning up SSH Keys")
err = s.cleanupTempSSHKeys(ctx, logger, selector)
if err != nil {
return fmt.Errorf("failed to clean up all ssh keys: %w", err)
}
logger.DebugContext(ctx, "cleaned up all ssh keys")
return nil
}
// cleanupTempServers requests deletion of every server matching selector and hands the resulting delete
// actions to actionutil.Settle, which reports the succeeded and the failed ones.
//
// A server whose delete request or delete action fails does not stop the others from being processed. The
// collected failures are joined into the returned error. Listing the servers or settling the actions failing
// aborts immediately.
func (s *Client) cleanupTempServers(ctx context.Context, logger *slog.Logger, selector string) error {
	listOpts := hcloud.ServerListOpts{ListOpts: hcloud.ListOpts{
		LabelSelector: selector,
	}}
	found, err := s.c.Server.AllWithOpts(ctx, listOpts)
	if err != nil {
		return fmt.Errorf("failed to list servers: %w", err)
	}

	if len(found) == 0 {
		logger.InfoContext(ctx, "No servers found")
		return nil
	}
	logger.InfoContext(ctx, "removing servers", "count", len(found))

	var failures []error
	pending := make([]*hcloud.Action, 0, len(found))

	for _, srv := range found {
		deleteResult, _, deleteErr := s.c.Server.DeleteWithResult(ctx, srv)
		if deleteErr != nil {
			failures = append(failures, deleteErr)
			logger.WarnContext(ctx, "failed to delete server", "server", srv.ID, "error", deleteErr)
			continue
		}

		pending = append(pending, deleteResult.Action)
	}

	succeeded, failed, err := actionutil.Settle(ctx, &s.c.Action, pending...)
	if err != nil {
		return fmt.Errorf("failed to wait for server delete: %w", err)
	}

	if len(succeeded) > 0 {
		// Collect the IDs of the servers referenced by the successful actions, for the log line only.
		deletedIDs := make([]int64, 0, len(succeeded))
		for _, act := range succeeded {
			for _, res := range act.Resources {
				if res.Type != hcloud.ActionResourceTypeServer {
					continue
				}
				deletedIDs = append(deletedIDs, res.ID)
			}
		}

		logger.InfoContext(ctx, "successfully deleted servers", "servers", deletedIDs)
	}

	for _, act := range failed {
		failures = append(failures, act.Error())
	}

	if len(failures) == 0 {
		return nil
	}

	// The returned message contains no info about the server IDs which failed
	return fmt.Errorf("failed to delete some of the servers: %w", errors.Join(failures...))
}
func (s *Client) cleanupTempSSHKeys(ctx context.Context, logger *slog.Logger, selector string) error {
keys, _, err := s.c.SSHKey.List(ctx, hcloud.SSHKeyListOpts{ListOpts: hcloud.ListOpts{
LabelSelector: selector,
}})
if err != nil {
return fmt.Errorf("failed to list keys: %w", err)
}
if len(keys) == 0 {
logger.InfoContext(ctx, "No ssh keys found")
return nil
}
errs := []error{}
for _, key := range keys {
_, err := s.c.SSHKey.Delete(ctx, key)
if err != nil {
errs = append(errs, err)
logger.WarnContext(ctx, "failed to delete ssh key", "ssh-key", key.ID, "error", err)
continue
}
}
if len(errs) > 0 {
// The returned message contains no info about the server IDs which failed
return fmt.Errorf("failed to delete some of the ssh keys: %w", errors.Join(errs...))
2024-04-29 21:00:04 +02:00
}
return nil
2024-04-29 21:00:04 +02:00
}
// assembleCommand builds the shell command that is run on the temporary server to write the image to /dev/sda.
//
// The command is a pipeline of up to three stages:
//
//   - download: "wget" of [UploadOptions.ImageURL], only added if the URL is set. Without it the pipeline
//     starts with the next stage and reads whatever is provided to the command as input.
//   - decompress: depends on [UploadOptions.ImageCompression], omitted for [CompressionNone].
//   - write: "dd" for [FormatRaw]; for [FormatQCOW2] the data is first stored in a file and then converted
//     with "qemu-img dd", followed by "sync".
//
// An error is returned for an unknown compression or format.
func assembleCommand(options UploadOptions) (string, error) {
	// Make sure that we fail early, ie. if the image url does not work
	cmd := "set -euo pipefail && "

	if options.ImageURL != nil {
		// The URL is quoted for the shell, otherwise characters like $, ` or ' in it would be interpreted by bash.
		cmd += fmt.Sprintf("wget --no-verbose -O - %s | ", shellQuote(options.ImageURL.String()))
	}

	switch options.ImageCompression {
	case CompressionNone:
		// Nothing to decompress.
	case CompressionBZ2:
		cmd += "bzip2 -cd | "
	case CompressionXZ:
		cmd += "xz -cd | "
	case CompressionZSTD:
		cmd += "zstd -cd | "
	default:
		return "", fmt.Errorf("unknown compression: %q", options.ImageCompression)
	}

	switch options.ImageFormat {
	case FormatRaw:
		cmd += "dd of=/dev/sda bs=4M"
	case FormatQCOW2:
		cmd += "tee image.qcow2 > /dev/null && qemu-img dd -f qcow2 -O raw if=image.qcow2 of=/dev/sda bs=4M"
	default:
		return "", fmt.Errorf("unknown format: %q", options.ImageFormat)
	}

	cmd += " && sync"

	// the pipefail does not work correctly without wrapping in bash.
	// The whole script is quoted as a single argument, so quotes inside of it can not end the argument early.
	return "bash -c " + shellQuote(cmd), nil
}

// shellQuote wraps s in single quotes so that a POSIX shell passes it on as one literal word. Single quotes
// inside of s are replaced by '\” (close the quote, escaped quote, reopen the quote).
func shellQuote(s string) string {
	return "'" + strings.ReplaceAll(s, "'", `'\''`) + "'"
}