# A hacky little script from Concedo that exposes llama.cpp function bindings
# allowing it to be used via a simulated kobold api endpoint
# generation delay scales linearly with original prompt length.
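#
# Illustrative invocation (the model filename below is an example, not shipped with
# this script; any ggml .bin model file works):
#   python koboldcpp.py model.bin --port 5001 --threads 4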
import ctypes
import os
import argparse
import json, http.server, threading, socket, sys, time
stop_token_max = 10
class load_model_inputs(ctypes.Structure):
    _fields_ = [("threads", ctypes.c_int),
                ("max_context_length", ctypes.c_int),
                ("batch_size", ctypes.c_int),
                ("f16_kv", ctypes.c_bool),
                ("executable_path", ctypes.c_char_p),
                ("model_filename", ctypes.c_char_p),
                ("lora_filename", ctypes.c_char_p),
                ("use_mmap", ctypes.c_bool),
                ("use_smartcontext", ctypes.c_bool),
                ("unban_tokens", ctypes.c_bool),
                ("clblast_info", ctypes.c_int),
                ("blasbatchsize", ctypes.c_int)]
class generation_inputs(ctypes.Structure):
    _fields_ = [("seed", ctypes.c_int),
                ("prompt", ctypes.c_char_p),
                ("max_context_length", ctypes.c_int),
                ("max_length", ctypes.c_int),
                ("temperature", ctypes.c_float),
                ("top_k", ctypes.c_int),
                ("top_p", ctypes.c_float),
                ("rep_pen", ctypes.c_float),
                ("rep_pen_range", ctypes.c_int),
                ("stop_sequence", ctypes.c_char_p * stop_token_max)]
class generation_outputs(ctypes.Structure):
    _fields_ = [("status", ctypes.c_int),
                ("text", ctypes.c_char * 16384)]
handle = None
use_blas = False  # if true, uses OpenBLAS for acceleration. libopenblas.dll must exist in the same dir.
use_clblast = False  # uses CLBlast instead
use_noavx2 = False  # uses OpenBLAS without AVX2 instructions
def getdirpath():
    return os.path.dirname(os.path.realpath(__file__))

def file_exists(filename):
    return os.path.exists(os.path.join(getdirpath(), filename))
def pick_existant_file(ntoption, nonntoption):
    # prefer the Windows (NT) library on Windows and the non-Windows one elsewhere,
    # but fall back to whichever file actually exists
    ntexist = file_exists(ntoption)
    nonntexist = file_exists(nonntoption)
    if os.name == 'nt':
        if nonntexist and not ntexist:
            return nonntoption
        return ntoption
    else:
        if ntexist and not nonntexist:
            return ntoption
        return nonntoption
lib_default = pick_existant_file("koboldcpp.dll","koboldcpp.so")
lib_noavx2 = pick_existant_file("koboldcpp_noavx2.dll","koboldcpp_noavx2.so")
lib_openblas = pick_existant_file("koboldcpp_openblas.dll","koboldcpp_openblas.so")
lib_openblas_noavx2 = pick_existant_file("koboldcpp_openblas_noavx2.dll","koboldcpp_openblas_noavx2.so")
lib_clblast = pick_existant_file("koboldcpp_clblast.dll","koboldcpp_clblast.so")
def init_library():
    global handle, use_blas, use_clblast, use_noavx2
    libname = ""
    if use_noavx2:
        if use_blas:
            libname = lib_openblas_noavx2
        else:
            libname = lib_noavx2
    else:
        if use_clblast:
            libname = lib_clblast
        elif use_blas:
            libname = lib_openblas
        else:
            libname = lib_default

    print("Initializing dynamic library: " + libname)
    dir_path = getdirpath()
    # OpenBLAS should provide about a 2x speedup on prompt ingestion if compatible.
    handle = ctypes.CDLL(os.path.join(dir_path, libname))
    handle.load_model.argtypes = [load_model_inputs]
    handle.load_model.restype = ctypes.c_bool
    handle.generate.argtypes = [generation_inputs, ctypes.c_wchar_p]  # the extra buffer argument is apparently needed for OSX to work; the reason is unclear
    handle.generate.restype = generation_outputs
def load_model(model_filename):
    inputs = load_model_inputs()
    inputs.model_filename = model_filename.encode("UTF-8")
    inputs.lora_filename = args.lora.encode("UTF-8")
    inputs.batch_size = 8
    inputs.max_context_length = maxctx  # initial value to use for ctx, can be overwritten
    inputs.threads = args.threads
    inputs.f16_kv = True
    inputs.use_mmap = (not args.nommap)
    if args.lora and args.lora != "":
        inputs.use_mmap = False  # mmap is disabled when a lora file is applied
    inputs.use_smartcontext = args.smartcontext
    inputs.unban_tokens = args.unbantokens
    inputs.blasbatchsize = args.blasbatchsize
    clblastids = 0
    if args.useclblast:
        # pack the CLBlast platform and device IDs into a single int:
        # 100 + platform*10 + device, e.g. --useclblast 1 0 -> 110
        clblastids = 100 + int(args.useclblast[0])*10 + int(args.useclblast[1])
    inputs.clblast_info = clblastids
    inputs.executable_path = (getdirpath()+"/").encode("UTF-8")
    ret = handle.load_model(inputs)
    return ret
def generate(prompt, max_length=20, max_context_length=512, temperature=0.8, top_k=100, top_p=0.85, rep_pen=1.1, rep_pen_range=128, seed=-1, stop_sequence=[]):
    inputs = generation_inputs()
    outputs = ctypes.create_unicode_buffer(ctypes.sizeof(generation_outputs))
    inputs.prompt = prompt.encode("UTF-8")
    inputs.max_context_length = max_context_length  # this will resize the context buffer if changed
    inputs.max_length = max_length
    inputs.temperature = temperature
    inputs.top_k = top_k
    inputs.top_p = top_p
    inputs.rep_pen = rep_pen
    inputs.rep_pen_range = rep_pen_range
    inputs.seed = seed
    # pad the stop sequence array out to exactly stop_token_max entries
    for n in range(stop_token_max):
        if n >= len(stop_sequence):
            inputs.stop_sequence[n] = "".encode("UTF-8")
        else:
            inputs.stop_sequence[n] = stop_sequence[n].encode("UTF-8")
    ret = handle.generate(inputs, outputs)
    if ret.status == 1:
        return ret.text.decode("UTF-8", "ignore")
    return ""
#################################################################
### A hacky simple HTTP server simulating a kobold api by Concedo
### we are intentionally NOT using flask, because we want MINIMAL dependencies
#################################################################
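
# Illustrative client call against the emulated KoboldAI API (the endpoint and the
# response shape match do_POST below; the host, port and payload values are examples):
#   import json, urllib.request
#   req = urllib.request.Request("http://localhost:5001/api/v1/generate",
#       data=json.dumps({"prompt": "Hello", "max_length": 50}).encode(),
#       headers={"Content-Type": "application/json"})
#   print(json.load(urllib.request.urlopen(req))["results"][0]["text"])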
friendlymodelname = "concedo/koboldcpp" # local kobold api apparently needs a hardcoded known HF model name
maxctx = 2048
maxlen = 128
modelbusy = False
defaultport = 5001
KcppVersion = "1.14"
class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
    sys_version = ""
    server_version = "ConcedoLlamaForKoboldServer"

    def __init__(self, addr, port, embedded_kailite):
        self.addr = addr
        self.port = port
        self.embedded_kailite = embedded_kailite

    def __call__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
    def do_GET(self):
        global maxctx, maxlen, friendlymodelname, KcppVersion
        self.path = self.path.rstrip('/')
        response_body = None

        if self.path in ["", "/?"] or self.path.startswith(('/?', '?')):  # the root url may carry ?params without a /
            if self.embedded_kailite is None:
                response_body = (f"Embedded Kobold Lite is not found.<br>You will have to connect via the main KoboldAI client, or <a href='https://lite.koboldai.net?local=1&port={self.port}'>use this URL</a> to connect.").encode()
            else:
                response_body = self.embedded_kailite
        elif self.path.endswith(('/api/v1/model', '/api/latest/model')):
            response_body = (json.dumps({'result': friendlymodelname}).encode())
        elif self.path.endswith(('/api/v1/config/max_length', '/api/latest/config/max_length')):
            response_body = (json.dumps({"value": maxlen}).encode())
        elif self.path.endswith(('/api/v1/config/max_context_length', '/api/latest/config/max_context_length')):
            response_body = (json.dumps({"value": maxctx}).encode())
        elif self.path.endswith(('/api/v1/config/soft_prompt', '/api/latest/config/soft_prompt')):
            response_body = (json.dumps({"value": ""}).encode())
        elif self.path.endswith(('/api/v1/config/soft_prompts_list', '/api/latest/config/soft_prompts_list')):
            response_body = (json.dumps({"values": []}).encode())
        elif self.path.endswith(('/api/v1/info/version', '/api/latest/info/version')):
            response_body = (json.dumps({"result": "1.2.2"}).encode())
        elif self.path.endswith('/api/extra/version'):
            response_body = (json.dumps({"result": "KoboldCpp", "version": KcppVersion}).encode())

        if response_body is None:
            self.send_response(404)
            self.end_headers()
            rp = 'Error: HTTP Server is running, but this endpoint does not exist. Please check the URL.'
            self.wfile.write(rp.encode())
        else:
            self.send_response(200)
            self.send_header('Content-Length', str(len(response_body)))
            self.end_headers()
            self.wfile.write(response_body)
        return
    def do_POST(self):
        global modelbusy
        content_length = int(self.headers['Content-Length'])
        body = self.rfile.read(content_length)
        basic_api_flag = False
        kai_api_flag = False
        self.path = self.path.rstrip('/')

        if modelbusy:
            # only one generation may run at a time
            self.send_response(503)
            self.end_headers()
            self.wfile.write(json.dumps({"detail": {
                "msg": "Server is busy; please try again later.",
                "type": "service_unavailable",
            }}).encode())
            return

        if self.path.endswith('/request'):
            basic_api_flag = True
        if self.path.endswith(('/api/v1/generate', '/api/latest/generate')):
            kai_api_flag = True

        if basic_api_flag or kai_api_flag:
            genparams = None
            try:
                genparams = json.loads(body)
            except ValueError:
                self.send_response(503)
                self.end_headers()
                return
            print("\nInput: " + json.dumps(genparams))
            modelbusy = True
            if kai_api_flag:
                fullprompt = genparams.get('prompt', "")
            else:
                fullprompt = genparams.get('text', "")
            newprompt = fullprompt

            recvtxt = ""
            res = {}
            if kai_api_flag:
                recvtxt = generate(
                    prompt=newprompt,
                    max_context_length=genparams.get('max_context_length', maxctx),
                    max_length=genparams.get('max_length', 50),
                    temperature=genparams.get('temperature', 0.8),
                    top_k=genparams.get('top_k', 200),
                    top_p=genparams.get('top_p', 0.85),
                    rep_pen=genparams.get('rep_pen', 1.1),
                    rep_pen_range=genparams.get('rep_pen_range', 128),
                    seed=-1,
                    stop_sequence=genparams.get('stop_sequence', [])
                    )
                print("\nOutput: " + recvtxt)
                res = {"results": [{"text": recvtxt}]}
            else:
                recvtxt = generate(
                    prompt=newprompt,
                    max_length=genparams.get('max', 50),
                    temperature=genparams.get('temperature', 0.8),
                    top_k=genparams.get('top_k', 200),
                    top_p=genparams.get('top_p', 0.85),
                    rep_pen=genparams.get('rep_pen', 1.1),
                    rep_pen_range=genparams.get('rep_pen_range', 128),
                    seed=-1,
                    stop_sequence=genparams.get('stop_sequence', [])
                    )
                print("\nOutput: " + recvtxt)
                res = {"data": {"seqs": [recvtxt]}}

            try:
                self.send_response(200)
                self.end_headers()
                self.wfile.write(json.dumps(res).encode())
            except:
                print("Generate: The response could not be sent; the connection may have been terminated.")
            modelbusy = False
            return

        self.send_response(404)
        self.end_headers()
    def do_OPTIONS(self):
        self.send_response(200)
        self.end_headers()

    def do_HEAD(self):
        self.send_response(200)
        self.end_headers()

    def end_headers(self):
        # permissive CORS headers so browser clients (e.g. a hosted Kobold Lite page) can connect
        self.send_header('Access-Control-Allow-Origin', '*')
        self.send_header('Access-Control-Allow-Methods', '*')
        self.send_header('Access-Control-Allow-Headers', '*')
        if "/api" in self.path:
            self.send_header('Content-type', 'application/json')
        else:
            self.send_header('Content-type', 'text/html')
        return super(ServerRequestHandler, self).end_headers()
def RunServerMultiThreaded(addr, port, embedded_kailite=None):
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    sock.bind((addr, port))
    sock.listen(5)

    class Thread(threading.Thread):
        def __init__(self, i):
            threading.Thread.__init__(self)
            self.i = i
            self.daemon = True
            self.start()

        def run(self):
            handler = ServerRequestHandler(addr, port, embedded_kailite)
            with http.server.HTTPServer((addr, port), handler, False) as self.httpd:
                try:
                    # reuse the pre-bound shared socket instead of binding a new one per thread
                    self.httpd.socket = sock
                    self.httpd.server_bind = self.server_close = lambda self: None
                    self.httpd.serve_forever()
                except (KeyboardInterrupt, SystemExit):
                    self.httpd.server_close()
                    sys.exit(0)
                finally:
                    self.httpd.server_close()
                    sys.exit(0)

        def stop(self):
            self.httpd.server_close()

    numThreads = 6
    threadArr = []
    for i in range(numThreads):
        threadArr.append(Thread(i))
    while 1:
        try:
            time.sleep(10)
        except KeyboardInterrupt:
            for i in range(numThreads):
                threadArr[i].stop()
            sys.exit(0)
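
# Design note (inferred from the code above): all six server threads share one
# listening socket that is bound before any HTTPServer instance is created, so the
# OS distributes incoming connections across the threads; the modelbusy flag in
# do_POST still serializes actual text generation.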
def main(args):
    global use_blas, use_clblast, use_noavx2
    global lib_default, lib_noavx2, lib_openblas, lib_openblas_noavx2, lib_clblast
    use_blas = False
    use_clblast = False
    use_noavx2 = False

    if args.noavx2:
        use_noavx2 = True
        if not file_exists(lib_openblas_noavx2) or (os.name == 'nt' and not file_exists("libopenblas.dll")):
            print("Warning: OpenBLAS library file not found. Non-BLAS library will be used.")
        elif args.noblas:
            print("Attempting to use non-avx2 compatibility library without OpenBLAS.")
        else:
            use_blas = True
            print("Attempting to use non-avx2 compatibility library with OpenBLAS. A compatible libopenblas will be required.")
    elif args.useclblast:
        if not file_exists(lib_clblast) or (os.name == 'nt' and not file_exists("clblast.dll")):
            print("Warning: CLBlast library file not found. Non-BLAS library will be used.")
        else:
            print("Attempting to use CLBlast library for faster prompt ingestion. A compatible clblast will be required.")
            use_clblast = True
    else:
        if not file_exists(lib_openblas) or (os.name == 'nt' and not file_exists("libopenblas.dll")):
            print("Warning: OpenBLAS library file not found. Non-BLAS library will be used.")
        elif args.noblas:
            print("Attempting to use the library without OpenBLAS.")
        else:
            use_blas = True
            print("Attempting to use OpenBLAS library for faster prompt ingestion. A compatible libopenblas will be required.")

    if args.psutil_set_threads:
        import psutil
        args.threads = psutil.cpu_count(logical=False)
        print("Overriding thread count, using " + str(args.threads) + " threads instead.")
    init_library()  # Note: if a BLAS library is enabled but its file does not exist, the program will crash.
    embedded_kailite = None
    ggml_selected_file = args.model_param
    if not ggml_selected_file:
        ggml_selected_file = args.model
    if not ggml_selected_file:
        # give the user a chance to pick a file via a GUI dialog
        print("For command line arguments, please refer to --help")
        print("Otherwise, please manually select ggml file:")
        try:
            from tkinter.filedialog import askopenfilename
            import tkinter as tk
            root = tk.Tk()  # we don't want the root window to be visible, but we do want it in the taskbar
            root.attributes("-alpha", 0)
            ggml_selected_file = askopenfilename(title="Select ggml model .bin files")
            root.destroy()
            if not ggml_selected_file:
                print("\nNo ggml model file was selected. Exiting.")
                time.sleep(2)
                sys.exit(2)
        except Exception:
            print("File selection GUI unsupported. Please check command line: script.py --help")
            time.sleep(2)
            sys.exit(2)

    if not os.path.exists(ggml_selected_file):
        print(f"Cannot find model file: {ggml_selected_file}")
        time.sleep(2)
        sys.exit(2)

    if args.lora and args.lora != "":
        if not os.path.exists(args.lora):
            print(f"Cannot find lora file: {args.lora}")
            time.sleep(2)
            sys.exit(2)
        else:
            args.lora = os.path.abspath(args.lora)

    modelname = os.path.abspath(ggml_selected_file)
    print(f"Loading model: {modelname} \n[Threads: {args.threads}, SmartContext: {args.smartcontext}]")
    loadok = load_model(modelname)
    print("Load Model OK: " + str(loadok))
    if not loadok:
        print("Could not load model: " + modelname)
        time.sleep(2)
        sys.exit(3)

    try:
        basepath = os.path.abspath(os.path.dirname(__file__))
        with open(os.path.join(basepath, "klite.embd"), mode='rb') as f:
            embedded_kailite = f.read()
            print("Embedded Kobold Lite loaded.")
    except:
        print("Could not find Kobold Lite. Embedded Kobold Lite will not be available.")

    if args.port_param != defaultport:
        args.port = args.port_param
    print(f"Starting Kobold HTTP Server on port {args.port}")
    epurl = ""
    if args.host == "":
        epurl = f"http://localhost:{args.port}" + ("?streaming=1" if args.stream else "")
    else:
        epurl = f"http://{args.host}:{args.port}" + ("?streaming=1" if args.stream else "")
    if args.launch:
        try:
            import webbrowser as wb
            wb.open(epurl)
        except:
            print("--launch was set, but could not launch web browser automatically.")
    print(f"Please connect to custom endpoint at {epurl}")
    RunServerMultiThreaded(args.host, args.port, embedded_kailite)
if __name__ == '__main__':
    print("Welcome to KoboldCpp - Version " + KcppVersion)  # just update version manually
    parser = argparse.ArgumentParser(description='Kobold llama.cpp server')
    modelgroup = parser.add_mutually_exclusive_group()  # we want to be backwards compatible with the unnamed positional args
    modelgroup.add_argument("--model", help="Model file to load", nargs="?")
    modelgroup.add_argument("model_param", help="Model file to load (positional)", nargs="?")
    portgroup = parser.add_mutually_exclusive_group()  # we want to be backwards compatible with the unnamed positional args
    portgroup.add_argument("--port", help="Port to listen on", default=defaultport, type=int, action='store')
    portgroup.add_argument("port_param", help="Port to listen on (positional)", default=defaultport, nargs="?", type=int, action='store')
    parser.add_argument("--host", help="Host IP to listen on. If empty, all routable interfaces are accepted.", default="")
    parser.add_argument("--launch", help="Launches a web browser when load is completed.", action='store_true')
    parser.add_argument("--lora", help="LLAMA models only, applies a lora file on top of model. Experimental.", default="")
    # os.environ["OMP_NUM_THREADS"] = '12'
    # default to roughly the physical core count, keeping at least one core free
    physical_core_limit = 1
    if os.cpu_count() is not None and os.cpu_count() > 1:
        physical_core_limit = int(os.cpu_count()/2)
    default_threads = (physical_core_limit if physical_core_limit <= 3 else max(3, physical_core_limit-1))
    parser.add_argument("--threads", help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores.", type=int, default=default_threads)
    parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutil to determine thread count based on physical cores.", action='store_true')
    parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512).", type=int, choices=[32,64,128,256,512,1024], default=512)
    parser.add_argument("--stream", help="Uses pseudo streaming", action='store_true')
    parser.add_argument("--smartcontext", help="Reserves a portion of context so the prompt needs to be reprocessed less frequently.", action='store_true')
    parser.add_argument("--unbantokens", help="Normally, KoboldAI prevents certain tokens such as EOS and square brackets. This flag unbans them.", action='store_true')
    parser.add_argument("--nommap", help="If set, do not use mmap to load newer models.", action='store_true')
    parser.add_argument("--noavx2", help="Do not use AVX2 instructions; a slower compatibility mode for older devices. Does not work with --clblast.", action='store_true')
    compatgroup = parser.add_mutually_exclusive_group()
    compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
    compatgroup.add_argument("--useclblast", help="Use CLBlast instead of OpenBLAS for prompt ingestion. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
    args = parser.parse_args()
    main(args)