fix: solve network disruption during downloads, add OLLAMA_DOWNLOAD_CONN setting

The Ollama server now downloads models using a single connection. This change
addresses the root cause of issue ollama#2006 by following best practices instead of
relying on workarounds. Users have been reporting problems associated with
model downloads since January 2024, describing issues such as "hogging the
entire device", "reliably and repeatedly kills my connection", "freezes
completely leaving no choice but to hard reset", "when I download models,
everyone in the office gets a really slow internet", and "when downloading
large models, it feels like my home network is being DDoSed."

The environment variable `OLLAMA_DOWNLOAD_CONN` controls the number of
concurrent connections, up to a maximum of 64 (the previous hard-coded default,
an aggressive value that is unsafe in some conditions). The new default is 1,
which gives each Ollama download the same priority as other network activity.

An entry in the FAQ describes how to tune `OLLAMA_DOWNLOAD_CONN` for different
use cases. The patch ships with a safe, unproblematic default value.

Changes include updates to the `envconfig/config.go`, `cmd/cmd.go`,
`server/download.go`, and `docs/faq.md` files.
supercurio committed Jul 13, 2024
1 parent 1ed0aa8 commit a93389f
Showing 4 changed files with 45 additions and 3 deletions.
1 change: 1 addition & 0 deletions cmd/cmd.go
@@ -1341,6 +1341,7 @@ func NewCLI() *cobra.Command {
envVars["OLLAMA_MAX_QUEUE"],
envVars["OLLAMA_MODELS"],
envVars["OLLAMA_NUM_PARALLEL"],
envVars["OLLAMA_DOWNLOAD_CONN"],
envVars["OLLAMA_NOPRUNE"],
envVars["OLLAMA_ORIGINS"],
envVars["OLLAMA_TMPDIR"],
18 changes: 17 additions & 1 deletion docs/faq.md
@@ -272,4 +272,20 @@ The following server settings may be used to adjust how Ollama handles concurren
- `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time. The default will auto-select either 4 or 1 based on available memory.
- `OLLAMA_MAX_QUEUE` - The maximum number of requests Ollama will queue when busy before rejecting additional requests. The default is 512

Note: Windows with Radeon GPUs currently default to 1 model maximum due to limitations in ROCm v5.7 for available VRAM reporting. Once ROCm v6.2 is available, Windows Radeon will follow the defaults above. You may enable concurrent model loads on Radeon on Windows, but ensure you don't load more models than will fit into your GPUs' VRAM.

## How do I select the amount of concurrent connections during model downloads?
The Ollama server can download models using multiple concurrent connections. If the default setting doesn't achieve the desired bandwidth utilization, you can increase the environment variable `OLLAMA_DOWNLOAD_CONN` up to a maximum of 64. The default value is 1, ensuring each Ollama download is given the same priority as other network activities.

- For home and office use, the default value of 1 is ideal: it prevents network disruption on the computer running the Ollama server and on its local network. A value of 2 is also reasonable if higher network utilization is needed.
- When the Ollama server runs on a host that downloads models each time it is initialized, it may be beneficial to increase `OLLAMA_DOWNLOAD_CONN`. The optimal value maximizes utilization of the link bandwidth during model downloads without significantly increasing overall network latency, which would otherwise degrade API request latency for content generation.

The setting can be exported as an environment variable:
```bash
export OLLAMA_DOWNLOAD_CONN=2
```
or passed inline when starting the server manually:
```bash
OLLAMA_DOWNLOAD_CONN=2 ollama serve
```
21 changes: 21 additions & 0 deletions envconfig/config.go
@@ -53,6 +53,8 @@ var (
NoPrune bool
// Set via OLLAMA_NUM_PARALLEL in the environment
NumParallel int
// Set via OLLAMA_DOWNLOAD_CONN in the environment
DownloadConnections int
// Set via OLLAMA_RUNNERS_DIR in the environment
RunnersDir string
// Set via OLLAMA_SCHED_SPREAD in the environment
@@ -94,6 +96,7 @@ func AsMap() map[string]EnvVar {
"OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"},
"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"},
"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests"},
"OLLAMA_DOWNLOAD_CONN": {"OLLAMA_DOWNLOAD_CONN", DownloadConnections, "Maximum number of concurrent download connections"},
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowOrigins, "A comma separated list of allowed origins"},
"OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"},
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"},
@@ -135,6 +138,7 @@ func init() {
MaxRunners = 0 // Autoselect
MaxQueuedRequests = 512
KeepAlive = 5 * time.Minute
DownloadConnections = 1

LoadConfig()
}
@@ -215,6 +219,23 @@ func LoadConfig() {
}
}

if dlp := clean("OLLAMA_DOWNLOAD_CONN"); dlp != "" {
const minDownloadConnections = 1
const maxDownloadConnections = 64

val, err := strconv.Atoi(dlp)
if err != nil {
slog.Error("invalid setting, ignoring", "OLLAMA_DOWNLOAD_CONN", dlp, "error", err)
} else if val < minDownloadConnections {
slog.Error("invalid setting, ignoring", "OLLAMA_DOWNLOAD_CONN", dlp, "minimum", minDownloadConnections)
} else if val > maxDownloadConnections {
slog.Error("invalid setting, correcting", "OLLAMA_DOWNLOAD_CONN", dlp, "maximum", maxDownloadConnections)
DownloadConnections = maxDownloadConnections
} else {
DownloadConnections = val
}
}

if nohistory := clean("OLLAMA_NOHISTORY"); nohistory != "" {
NoHistory = true
}
8 changes: 6 additions & 2 deletions server/download.go
@@ -22,13 +22,15 @@ import (
"golang.org/x/sync/errgroup"

"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
)

const maxRetries = 6

var errMaxRetriesExceeded = errors.New("max retries exceeded")
var errPartStalled = errors.New("part stalled")
var numDownloadParts = envconfig.DownloadConnections

var blobDownloadManager sync.Map

@@ -59,7 +61,6 @@ type blobDownloadPart struct {
}

const (
numDownloadParts = 64
minDownloadPartSize int64 = 100 * format.MegaByte
maxDownloadPartSize int64 = 1000 * format.MegaByte
)
@@ -111,7 +112,7 @@ func (b *blobDownload) Prepare(ctx context.Context, requestURL *url.URL, opts *r

b.Total, _ = strconv.ParseInt(resp.Header.Get("Content-Length"), 10, 64)

size := b.Total / numDownloadParts
size := b.Total / int64(numDownloadParts)
switch {
case size < minDownloadPartSize:
size = minDownloadPartSize
@@ -211,6 +212,9 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis
}

func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w io.Writer, part *blobDownloadPart, opts *registryOptions) error {
slog.Debug(fmt.Sprintf("Download chunk part %d, range: %s - %s to file: %s",
part.N, format.HumanBytes(part.StartsAt()), format.HumanBytes(part.StopsAt()), part.Name()))

g, ctx := errgroup.WithContext(ctx)
g.Go(func() error {
headers := make(http.Header)
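With `numDownloadParts` now driven by `OLLAMA_DOWNLOAD_CONN`, the part-size computation in `Prepare` divides the blob total by the connection count and clamps the result. A minimal standalone sketch of that calculation, assuming the decimal megabyte used by ollama's `format` package:

```go
package main

import "fmt"

const (
	megaByte            int64 = 1000 * 1000 // assumed decimal, as in ollama's format package
	minDownloadPartSize int64 = 100 * megaByte
	maxDownloadPartSize int64 = 1000 * megaByte
)

// partSize splits a blob across the configured number of download
// connections, clamping each part between 100 MB and 1000 MB.
func partSize(total int64, connections int) int64 {
	size := total / int64(connections)
	switch {
	case size < minDownloadPartSize:
		size = minDownloadPartSize
	case size > maxDownloadPartSize:
		size = maxDownloadPartSize
	}
	return size
}

func main() {
	// With the new default of 1 connection, a 4 GB blob is still split
	// into sequential parts capped at maxDownloadPartSize.
	fmt.Println(partSize(4000*megaByte, 1))  // clamped down to the 1000 MB maximum
	fmt.Println(partSize(4000*megaByte, 64)) // raised up to the 100 MB minimum
}
```

With one connection, parts hit the 1000 MB cap and download sequentially; with the old default of 64, the same 4 GB blob would yield sub-minimum parts raised to 100 MB each.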
