--- a/envconfig/config.go +++ b/envconfig/config.go @@ -194,6 +194,8 @@ NoHistory = Bool("OLLAMA_NOHISTORY") // NoPrune disables pruning of model blobs on startup. NoPrune = Bool("OLLAMA_NOPRUNE") + // NoMMap disables memory mapping of the model file. + NoMMap = Bool("OLLAMA_NO_MMAP") // SchedSpread allows scheduling models across all GPUs. SchedSpread = Bool("OLLAMA_SCHED_SPREAD") // MultiUserCache optimizes prompt caching for multi-user scenarios @@ -286,6 +288,7 @@ "OLLAMA_MODELS": {"OLLAMA_MODELS", Models(), "The path to the models directory"}, "OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory(), "Do not preserve readline history"}, "OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"}, + "OLLAMA_NO_MMAP": {"OLLAMA_NO_MMAP", NoMMap(), "Disable memory mapping of the model file"}, "OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"}, "OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"}, "OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"}, --- a/llm/server.go +++ b/llm/server.go @@ -630,7 +630,8 @@ // Windows CUDA should not use mmap for best performance // Linux with a model larger than free space, mmap leads to thrashing // For CPU loads we want the memory to be allocated, not FS cache - if (runtime.GOOS == "windows" && len(gpus) > 0 && gpus[0].Library == "CUDA" && s.options.UseMMap == nil) || + if envconfig.NoMMap() || + (runtime.GOOS == "windows" && len(gpus) > 0 && gpus[0].Library == "CUDA" && s.options.UseMMap == nil) || (runtime.GOOS == "linux" && systemInfo.FreeMemory < s.TotalSize() && s.options.UseMMap == nil) || (len(gpus) == 0 && s.options.UseMMap == nil) || (len(gpus) > 0 && gpus[0].Library == "Vulkan" && s.options.UseMMap == nil) ||