From d3c0d28e605dc302562146406eef71e41c4cbd68 Mon Sep 17 00:00:00 2001 From: kpamnany Date: Tue, 27 Jun 2017 17:53:49 +0530 Subject: [PATCH 1/4] threading: Integrating partr (almost done!) Added partr code. Abstracted interface to threading infrastructure. --- Make.inc | 11 + base/Makefile | 5 + base/boot.jl | 12 + base/channels.jl | 200 +++- base/event.jl | 255 ++-- base/stream.jl | 8 +- base/summarysize.jl | 12 + base/task.jl | 53 +- base/threadingconstructs.jl | 6 +- contrib/julia-config.jl | 3 + doc/src/manual/faq.md | 3 +- src/Makefile | 4 +- src/atomics.h | 6 + src/builtins.c | 3 + src/dump.c | 8 +- src/forkjoin-ti.c | 356 ++++++ src/forkjoin-ti.h | 21 + src/gc-debug.c | 10 +- src/gc.c | 24 +- src/init.c | 5 +- src/julia.h | 111 +- src/julia_internal.h | 17 +- src/julia_threads.h | 6 +- src/locks.h | 16 + src/options.h | 6 + src/partr.c | 1154 +++++++++++++++++++ src/partr.h | 46 + src/staticdata.c | 3 + src/task.c | 169 ++- src/threadgroup.c | 206 ---- src/threadgroup.h | 44 - src/threading.c | 376 +----- src/threading.h | 70 +- stdlib/Distributed/test/distributed_exec.jl | 4 +- stdlib/FileWatching/src/FileWatching.jl | 6 +- stdlib/FileWatching/test/runtests.jl | 6 +- stdlib/Sockets/src/addrinfo.jl | 8 +- stdlib/Sockets/test/runtests.jl | 6 +- test/channels.jl | 40 +- test/file.jl | 2 +- test/misc.jl | 1 + test/read.jl | 3 +- test/spawn.jl | 6 +- test/threads.jl | 2 +- 44 files changed, 2459 insertions(+), 854 deletions(-) create mode 100644 src/forkjoin-ti.c create mode 100644 src/forkjoin-ti.h create mode 100644 src/partr.c create mode 100644 src/partr.h delete mode 100644 src/threadgroup.c delete mode 100644 src/threadgroup.h diff --git a/Make.inc b/Make.inc index 81ff923156ed8..19e6265b65b3e 100644 --- a/Make.inc +++ b/Make.inc @@ -68,6 +68,12 @@ USEIFC ?= 0 # Enable threading with one thread JULIA_THREADS := 1 +# Enable the parallel task runtime +JULIA_PARTR ?= 0 +ifeq ($(JULIA_THREADS), 0) +JULIA_PARTR := 0 +endif + ifeq ($(USE_MKL), 1) $(warning "The julia make variable USE_MKL has been renamed to USE_INTEL_MKL") USE_INTEL_MKL := 1 @@ -1060,6 +1066,11 @@ ifneq ($(JULIA_THREADS), 0) JCPPFLAGS += -DJULIA_ENABLE_THREADING -DJULIA_NUM_THREADS=$(JULIA_THREADS) endif +# Parallel task runtime +ifneq ($(JULIA_PARTR), 0) +JCPPFLAGS += -DJULIA_ENABLE_PARTR +endif + # Intel VTune Amplifier ifeq ($(USE_INTEL_JITEVENTS), 1) JCPPFLAGS += -DJL_USE_INTEL_JITEVENTS diff --git a/base/Makefile b/base/Makefile index b569ed5227ffc..2d0612368c17c 100644 --- a/base/Makefile +++ b/base/Makefile @@ -71,6 +71,11 @@ else @echo "const PRIVATE_LIBDIR = \"$(private_libdir_rel)\"" >> $@ @echo "const INCLUDEDIR = \"$(includedir_rel)\"" >> $@ endif +ifneq ($(JULIA_PARTR), 0) + @echo "const JULIA_PARTR = true" >> $@ +else + @echo "const JULIA_PARTR = false" >> $@ +endif @# This to ensure that we always rebuild this file, but only when it is modified do we touch build_h.jl, @# ensuring we rebuild the system image as infrequently as possible diff --git a/base/boot.jl b/base/boot.jl index 03b389b0f7e1f..67ad236105eaf 100644 --- a/base/boot.jl +++ b/base/boot.jl @@ -128,6 +128,17 @@ # name::Symbol #end +#if JULIA_PARTR +#mutable struct Task +# storage::Any +# state::Symbol +# result::Any +# exception::Any +# backtrace::Any +# logstate::Any +# code::Any +#end +#else #mutable struct Task # parent::Task # storage::Any @@ -139,6 +150,7 @@ # logstate::Any # code::Any #end +#end export # key types diff --git a/base/channels.jl b/base/channels.jl index 090fad3ad877f..6fc56ded3ee64 100644 --- a/base/channels.jl +++ 
b/base/channels.jl @@ -2,6 +2,10 @@ abstract type AbstractChannel{T} end +if JULIA_PARTR + +using Base.Threads + """ Channel{T}(sz::Int) @@ -21,7 +25,54 @@ mutable struct Channel{T} <: AbstractChannel{T} cond_take::Condition # waiting for data to become available cond_put::Condition # waiting for a writeable slot state::Symbol - excp::Union{Exception, Nothing} # exception to be thrown when state != :open + excp::Union{Exception,Nothing} # exception to be thrown when state != :open + + data::Vector{T} + sz_max::Int # maximum size of channel + lock::SpinLock + + # The following fields synchronize tasks that use unbuffered channels + # (sz_max == 0). + nwaiters::Atomic{Int} + takers::Vector{Task} + putters::Vector{Task} + + function Channel{T}(sz::Float64) where T + Channel{T}(sz == Inf ? typemax(Int) : convert(Int, sz)) + end + function Channel{T}(sz::Integer) where T + sz < 0 && throw(ArgumentError("Channel size must be 0, a positive integer, or Inf")) + ch = new(Condition(), Condition(), :open, nothing, Vector{T}(), sz, SpinLock(), Atomic()) + if sz == 0 + ch.takers = Vector{Task}() + ch.putters = Vector{Task}() + end + return ch + end +end + +else # !JULIA_PARTR + +""" + Channel{T}(sz::Int) + +Constructs a `Channel` with an internal buffer that can hold a maximum of `sz` objects +of type `T`. +[`put!`](@ref) calls on a full channel block until an object is removed with [`take!`](@ref). + +`Channel(0)` constructs an unbuffered channel. `put!` blocks until a matching `take!` is called. +And vice-versa. + +Other constructors: + +* `Channel(Inf)`: equivalent to `Channel{Any}(typemax(Int))` +* `Channel(sz)`: equivalent to `Channel{Any}(sz)` +""" +mutable struct Channel{T} <: AbstractChannel{T} + cond_take::Condition # waiting for data to become available + cond_put::Condition # waiting for a writeable slot + state::Symbol + excp::Union{Exception, Nothing} # exception to be thrown when state != :open data::Vector{T} sz_max::Int # maximum size of channel @@ -51,6 +102,8 @@ mutable struct Channel{T} <: AbstractChannel{T} end end +end # !JULIA_PARTR + Channel(sz) = Channel{Any}(sz) # special constructors @@ -88,13 +141,13 @@ Referencing the created task: ```jldoctest julia> taskref = Ref{Task}(); -julia> chnl = Channel(c->(@show take!(c)); taskref=taskref); +julia> chnl = Channel(c->println(take!(c)); taskref=taskref); julia> istaskdone(taskref[]) false julia> put!(chnl, "Hello"); -take!(c) = "Hello" +Hello julia> istaskdone(taskref[]) true @@ -110,7 +163,6 @@ function Channel(func::Function; ctype=Any, csize=0, taskref=nothing) return chnl end - closed_exception() = InvalidStateException("Channel is closed.", :closed) isbuffered(c::Channel) = c.sz_max==0 ? false : true @@ -121,6 +173,7 @@ function check_channel_state(c::Channel) throw(closed_exception()) end end + """ close(c::Channel) @@ -255,6 +308,25 @@ function put!(c::Channel{T}, v) where T isbuffered(c) ? 
put_buffered(c,v) : put_unbuffered(c,v) end +if JULIA_PARTR + +function put_buffered(c::Channel, v) + while true + lock(c.lock) + if length(c.data) == c.sz_max + unlock(c.lock) + wait(c.cond_put) + else + push!(c.data, v) + notify(c.cond_take, nothing, true, false) + unlock(c.lock) + return v + end + end +end + +else # !JULIA_PARTR + function put_buffered(c::Channel, v) while length(c.data) == c.sz_max wait(c.cond_put) @@ -266,6 +338,28 @@ function put_buffered(c::Channel, v) v end +end # !JULIA_PARTR + +if JULIA_PARTR + +function put_unbuffered(c::Channel, v) + while true + lock(c.lock) + if length(c.takers) > 0 + taker = popfirst!(c.takers) + unlock(c.lock) + yield(taker, v) + return v + else + unlock(c.lock) + c.nwaiters[] > 0 && notify(c.cond_take, nothing, false, false) + wait(c.cond_put) + end + end +end + +else # !JULIA_PARTR + function put_unbuffered(c::Channel, v) if length(c.takers) == 0 push!(c.putters, current_task()) @@ -283,8 +377,37 @@ function put_unbuffered(c::Channel, v) return v end +end # !JULIA_PARTR + push!(c::Channel, v) = put!(c, v) +if JULIA_PARTR + +""" + fetch(c::Channel) + +Wait for and get the first available item from the channel. Does not +remove the item. `fetch` is unsupported on an unbuffered (0-size) channel. +""" +function fetch(c::Channel) + c.sz_max == 0 && throw(ErrorException("`fetch` is not supported on an unbuffered Channel")) + while true + check_channel_state(c) + lock(c.lock) + if length(c.data) < 1 + unlock(c.lock) + # TODO: fix the race here + wait(c.cond_take) + else + v = c.data[1] + unlock(c.lock) + return v + end + end +end + +else # !JULIA_PARTR + """ fetch(c::Channel) @@ -298,6 +421,7 @@ function fetch_buffered(c::Channel) end fetch_unbuffered(c::Channel) = throw(ErrorException("`fetch` is not supported on an unbuffered Channel.")) +end # !JULIA_PARTR """ take!(c::Channel) @@ -308,6 +432,26 @@ For unbuffered channels, blocks until a [`put!`](@ref) is performed by a differe task. """ take!(c::Channel) = isbuffered(c) ? take_buffered(c) : take_unbuffered(c) + +if JULIA_PARTR + +function take_buffered(c::Channel) + while true + lock(c.lock) + if length(c.data) > 0 + v = popfirst!(c.data) + unlock(c.lock) + notify(c.cond_put, nothing, false, false) + return v + end + unlock(c.lock) + check_channel_state(c) + wait(c.cond_take) + end +end + +else # !JULIA_PARTR + function take_buffered(c::Channel) wait(c) v = popfirst!(c.data) @@ -315,7 +459,29 @@ function take_buffered(c::Channel) v end -popfirst!(c::Channel) = take!(c) +end # !JULIA_PARTR + +if JULIA_PARTR + +function take_unbuffered(c::Channel{T}) where T + check_channel_state(c) + lock(c.lock) + push!(c.takers, current_task()) + unlock(c.lock) + notify(c.cond_put, nothing, false, false) + try + # We wait here for a putter which will reschedule us with the + # value it is putting (which is returned by this wait call). + return wait()::T + catch ex + lock(c.lock) + filter!(x->x!=current_task(), c.takers) + unlock(c.lock) + rethrow(ex) + end +end + +else # !JULIA_PARTR # 0-size channel function take_unbuffered(c::Channel{T}) where T @@ -338,6 +504,10 @@ function take_unbuffered(c::Channel{T}) where T end end +end # !JULIA_PARTR + +popfirst!(c::Channel) = take!(c) + """ isready(c::Channel) @@ -348,7 +518,14 @@ For unbuffered channels returns `true` if there are tasks waiting on a [`put!`](@ref). """ isready(c::Channel) = n_avail(c) > 0 + +if JULIA_PARTR +n_avail(c::Channel) = lock(c.lock) do + isbuffered(c) ? length(c.data) : isempty(c.cond_put) ? 
0 : 1 +end +else # !JULIA_PARTR n_avail(c::Channel) = isbuffered(c) ? length(c.data) : length(c.putters) +end # !JULIA_PARTR wait(c::Channel) = isbuffered(c) ? wait_impl(c) : wait_unbuffered(c) function wait_impl(c::Channel) @@ -359,6 +536,17 @@ function wait_impl(c::Channel) nothing end +if JULIA_PARTR +function wait_unbuffered(c::Channel) + atomic_add!(c.nwaiters, 1) + try + wait_impl(c) + finally + atomic_sub!(c.nwaiters, 1) + end + nothing +end +else # !JULIA_PARTR function wait_unbuffered(c::Channel) c.waiters += 1 try @@ -368,6 +556,7 @@ function wait_unbuffered(c::Channel) end nothing end +end # !JULIA_PARTR function notify_error(c::Channel, err) notify_error(c.cond_take, err) @@ -379,6 +568,7 @@ function notify_error(c::Channel, err) foreach(t->schedule(t, err; error=true), waiters) end end + notify_error(c::Channel) = notify_error(c, c.excp) eltype(::Type{Channel{T}}) where {T} = T diff --git a/base/event.jl b/base/event.jl index cf5e93cc25934..c0a28b23edbce 100644 --- a/base/event.jl +++ b/base/event.jl @@ -2,41 +2,45 @@ ## condition variables -""" - Condition() -Create an edge-triggered event source that tasks can wait for. Tasks that call [`wait`](@ref) on a -`Condition` are suspended and queued. Tasks are woken up when [`notify`](@ref) is later called on -the `Condition`. Edge triggering means that only tasks waiting at the time [`notify`](@ref) is -called can be woken up. For level-triggered notifications, you must keep extra state to keep -track of whether a notification has happened. The [`Channel`](@ref) type does -this, and so can be used for level-triggered events. -""" -mutable struct Condition - waitq::Vector{Any} +if JULIA_PARTR - Condition() = new([]) -end +import Core.Condition -""" - wait([x]) +Condition() = ccall(:jl_condition_new, Ref{Condition}, ()) -Block the current task until some event occurs, depending on the type of the argument: +wait(c::Condition) = ccall(:jl_task_wait, Any, (Ref{Condition},), c) -* [`Channel`](@ref): Wait for a value to be appended to the channel. -* [`Condition`](@ref): Wait for [`notify`](@ref) on a condition. -* `Process`: Wait for a process or process chain to exit. The `exitcode` field of a process - can be used to determine success or failure. -* [`Task`](@ref): Wait for a `Task` to finish. If the task fails with an exception, the - exception is propagated (re-thrown in the task that called `wait`). -* [`RawFD`](@ref): Wait for changes on a file descriptor (see the `FileWatching` package). +notify(c::Condition, arg, all, error) = ccall(:jl_task_notify, Cvoid, (Ref{Condition},Any,Int8,Int8), c, arg, all, error) +notify(c::Condition, @nospecialize(arg = nothing); all=true, error=false) = notify(c, arg, all, error) +notify_error(c::Condition, err) = notify(c, err, true, true) -If no argument is passed, the task blocks for an undefined period. A task can only be -restarted by an explicit call to [`schedule`](@ref) or [`yieldto`](@ref). 
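For illustration (not part of the patch), a minimal usage sketch of these primitives; the behavior is the same whether `wait`/`notify` dispatch to the partr `ccall` bindings above or to the pure-Julia fallback below:

```julia
# Edge-triggered wakeup: only tasks already waiting when notify() runs are woken.
cond = Condition()
waiter = @async wait(cond)   # suspends inside wait() until notified
yield()                      # let the waiter block first
notify(cond, 42)             # wake it, passing 42 as wait()'s return value
fetch(waiter)                # == 42
```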
+isempty(c::Condition) = ccall(:jl_condition_isempty, Cint, (Ref{Condition},), c) == 1 + +schedule(t::Task, @nospecialize(arg = nothing); error=false) = + ccall(:jl_task_spawn, Ref{Task}, (Ref{Task},Any,Int8,Int8,Int8), + t, arg, error, true, true) + +fetch(t::Task) = ccall(:jl_task_sync, Any, (Ref{Task},), t) + +yield() = ccall(:jl_task_yield, Any, (Cint,), 1) +yield(t::Task, @nospecialize x = nothing) = (schedule(t, x); yield()) +yieldto(t::Task, @nospecialize x = nothing) = (schedule(t, x); wait()) +try_yieldto(undo, reftask::Ref{Task}) = (schedule(reftask[]); wait()) +throwto(t::Task, @nospecialize exc) = (schedule(t, exc, error=true); wait()) + +wait() = ccall(:jl_task_yield, Any, (Cint,), 0) + + +else # !JULIA_PARTR + + +mutable struct Condition + waitq::Vector{Any} + + Condition() = new([]) +end -Often `wait` is called within a `while` loop to ensure a waited-for condition is met before -proceeding. -""" function wait(c::Condition) ct = current_task() @@ -50,15 +54,6 @@ function wait(c::Condition) end end -""" - notify(condition, val=nothing; all=true, error=false) - -Wake up tasks waiting for a condition, passing them `val`. If `all` is `true` (the default), -all waiting tasks are woken, otherwise only one is. If `error` is `true`, the passed value -is raised as an exception in the woken tasks. - -Return the count of tasks woken up. Return 0 if no tasks are waiting on `condition`. -""" notify(c::Condition, @nospecialize(arg = nothing); all=true, error=false) = notify(c, arg, all, error) function notify(c::Condition, arg, all, error) cnt = 0 @@ -78,6 +73,8 @@ end notify_error(c::Condition, err) = notify(c, err, true, true) +isempty(c::Condition) = isempty(c.waitq) + n_waiters(c::Condition) = length(c.waitq) ## scheduler and work queue @@ -94,36 +91,6 @@ end schedule(t::Task) = enq_work(t) -""" - schedule(t::Task, [val]; error=false) - -Add a [`Task`](@ref) to the scheduler's queue. This causes the task to run constantly when the system -is otherwise idle, unless the task performs a blocking operation such as [`wait`](@ref). - -If a second argument `val` is provided, it will be passed to the task (via the return value of -[`yieldto`](@ref)) when it runs again. If `error` is `true`, the value is raised as an exception in -the woken task. - -# Examples -```jldoctest -julia> a5() = sum(i for i in 1:1000); - -julia> b = Task(a5); - -julia> istaskstarted(b) -false - -julia> schedule(b); - -julia> yield(); - -julia> istaskstarted(b) -true - -julia> istaskdone(b) -true -``` -""" function schedule(t::Task, arg; error=false) # schedule a task to be (re)started with the given value or exception if error @@ -134,34 +101,8 @@ function schedule(t::Task, arg; error=false) return enq_work(t) end -# fast version of `schedule(t, arg); wait()` -function schedule_and_wait(t::Task, arg=nothing) - t.state == :runnable || error("schedule: Task not runnable") - if isempty(Workqueue) - return yieldto(t, arg) - else - t.result = arg - push!(Workqueue, t) - t.state = :queued - end - return wait() -end - -""" - yield() - -Switch to the scheduler to allow another scheduled task to run. A task that calls this -function is still runnable, and will be restarted immediately if there are no other runnable -tasks. -""" yield() = (enq_work(current_task()); wait()) -""" - yield(t::Task, arg = nothing) - -A fast, unfair-scheduling version of `schedule(t, arg); yield()` which -immediately yields to `t` before calling the scheduler. 
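Because a `Condition` is edge-triggered, consumers conventionally re-check their predicate in a `while` loop around `wait`; a hedged sketch of that pattern (illustrative only):

```julia
items = Any[]
cond = Condition()

consumer = @async begin
    while isempty(items)     # re-check after every wakeup; wakeups can race
        wait(cond)
    end
    popfirst!(items)
end
yield()                      # let the consumer block
push!(items, "work"); notify(cond)
fetch(consumer)              # == "work"
```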
-""" function yield(t::Task, @nospecialize x = nothing) t.state == :runnable || error("schedule: Task not runnable") t.result = x @@ -169,14 +110,6 @@ function yield(t::Task, @nospecialize x = nothing) return try_yieldto(ensure_rescheduled, Ref(t)) end -""" - yieldto(t::Task, arg = nothing) - -Switch to the given task. The first time a task is switched to, the task's function is -called with no arguments. On subsequent switches, `arg` is returned from the task's last -call to `yieldto`. This is a low-level call that only switches tasks, not considering states -or scheduling in any way. Its use is discouraged. -""" function yieldto(t::Task, @nospecialize x = nothing) t.result = x return try_yieldto(identity, Ref(t)) @@ -262,6 +195,126 @@ function wait() # unreachable end +end # JULIA_PARTR + +""" + isempty(condition) + +Return `true` if no tasks are waiting on the condition, `false` otherwise. +""" +isempty(c::Condition) + +""" + Condition() + +Create an edge-triggered event source that tasks can wait for. Tasks that call [`wait`](@ref) on a +`Condition` are suspended and queued. Tasks are woken up when [`notify`](@ref) is later called on +the `Condition`. Edge triggering means that only tasks waiting at the time [`notify`](@ref) is +called can be woken up. For level-triggered notifications, you must keep extra state to keep +track of whether a notification has happened. The [`Channel`](@ref) type does +this, and so can be used for level-triggered events. +""" +Condition + +""" + wait([x]) + +Block the current task until some event occurs, depending on the type of the argument: + +* [`Channel`](@ref): Wait for a value to be appended to the channel. +* [`Condition`](@ref): Wait for [`notify`](@ref) on a condition. +* `Process`: Wait for a process or process chain to exit. The `exitcode` field of a process + can be used to determine success or failure. +* [`Task`](@ref): Wait for a `Task` to finish. If the task fails with an exception, the + exception is propagated (re-thrown in the task that called `wait`). +* [`RawFD`](@ref): Wait for changes on a file descriptor (see the `FileWatching` package). + +If no argument is passed, the task blocks for an undefined period. A task can only be +restarted by an explicit call to [`schedule`](@ref) or [`yieldto`](@ref). + +Often `wait` is called within a `while` loop to ensure a waited-for condition is met before +proceeding. +""" +wait + +""" + notify(condition, val=nothing; all=true, error=false) + +Wake up tasks waiting for a condition, passing them `val`. If `all` is `true` (the default), +all waiting tasks are woken, otherwise only one is. If `error` is `true`, the passed value +is raised as an exception in the woken tasks. + +Return the count of tasks woken up. Return 0 if no tasks are waiting on `condition`. +""" +notify + +""" + fetch(t::Task) + +Wait for a Task to finish, then return its result value. If the task fails with an +exception, the exception is propagated (re-thrown in the task that called fetch). +""" +fetch(t::Task) + +""" + yield() + +Switch to the scheduler to allow another scheduled task to run. A task that calls this +function is still runnable, and will be restarted immediately if there are no other runnable +tasks. +""" +yield + +""" + yield(t::Task, arg = nothing) + +A fast, unfair-scheduling version of `schedule(t, arg); yield()` which +immediately yields to `t` before calling the scheduler. +""" +yield(t::Task) + +""" + yieldto(t::Task, arg = nothing) + +Switch to the given task. 
The first time a task is switched to, the task's function is +called with no arguments. On subsequent switches, `arg` is returned from the task's last +call to `yieldto`. This is a low-level call that only switches tasks, not considering states +or scheduling in any way. Its use is discouraged. +""" +yieldto + +""" + schedule(t::Task, [val]; error=false) + +Add a [`Task`](@ref) to the scheduler's queue. This causes the task to run constantly when the system +is otherwise idle, unless the task performs a blocking operation such as [`wait`](@ref). + +If a second argument `val` is provided, it will be passed to the task (via the return value of +[`yieldto`](@ref)) when it runs again. If `error` is `true`, the value is raised as an exception in +the woken task. + +# Examples +```jldoctest +julia> a5() = sum(i for i in 1:1000); + +julia> b = Task(a5); + +julia> istaskstarted(b) +false + +julia> schedule(b); + +julia> yield(); + +julia> istaskstarted(b) +true + +julia> istaskdone(b) +true +``` +""" +schedule + if Sys.iswindows() pause() = ccall(:Sleep, stdcall, Cvoid, (UInt32,), 0xffffffff) else diff --git a/base/stream.jl b/base/stream.jl index 3d54865c89732..bee3e67b1b0ae 100644 --- a/base/stream.jl +++ b/base/stream.jl @@ -274,7 +274,7 @@ function wait_readbyte(x::LibuvStream, c::UInt8) wait(x.readnotify) end finally - if isempty(x.readnotify.waitq) + if isempty(x.readnotify) stop_reading(x) # stop reading iff there are currently no other read clients of the stream end unpreserve_handle(x) @@ -297,7 +297,7 @@ function wait_readnb(x::LibuvStream, nb::Int) wait(x.readnotify) end finally - if isempty(x.readnotify.waitq) + if isempty(x.readnotify) stop_reading(x) # stop reading iff there are currently no other read clients of the stream end if oldthrottle <= x.throttle <= nb @@ -703,7 +703,7 @@ function readbytes!(s::LibuvStream, a::Vector{UInt8}, nb::Int) return bytesavailable(newbuf) finally s.buffer = sbuf - if !isempty(s.readnotify.waitq) + if !isempty(s.readnotify) start_reading(s) # resume reading iff there are currently other read clients of the stream end end @@ -739,7 +739,7 @@ function unsafe_read(s::LibuvStream, p::Ptr{UInt8}, nb::UInt) nb == bytesavailable(newbuf) || throw(EOFError()) finally s.buffer = sbuf - if !isempty(s.readnotify.waitq) + if !isempty(s.readnotify) start_reading(s) # resume reading iff there are currently other read clients of the stream end end diff --git a/base/summarysize.jl b/base/summarysize.jl index a2974b967ce3e..7b54150546786 100644 --- a/base/summarysize.jl +++ b/base/summarysize.jl @@ -147,6 +147,16 @@ function (ss::SummarySize)(obj::Module) return size end +if JULIA_PARTR + +function (ss::SummarySize)(obj::Task) + haskey(ss.seen, obj) ? (return 0) : (ss.seen[obj] = true) + size::Int = Core.sizeof(obj) + return size +end + +else + function (ss::SummarySize)(obj::Task) haskey(ss.seen, obj) ? 
(return 0) : (ss.seen[obj] = true) size::Int = Core.sizeof(obj) @@ -161,3 +171,5 @@ function (ss::SummarySize)(obj::Task) # TODO: add stack size, and possibly traverse stack roots return size end + +end diff --git a/base/task.jl b/base/task.jl index 4045cde09ffa1..0a62394945a93 100644 --- a/base/task.jl +++ b/base/task.jl @@ -177,6 +177,15 @@ function task_local_storage(body::Function, key, val) end end +if JULIA_PARTR + +function wait(t::Task) + fetch(t) + return nothing +end + +else # !JULIA_PARTR + # NOTE: you can only wait for scheduled tasks function wait(t::Task) if !istaskdone(t) @@ -192,17 +201,12 @@ function wait(t::Task) end end -""" - fetch(t::Task) - -Wait for a Task to finish, then return its result value. If the task fails with an -exception, the exception is propagated (re-thrown in the task that called fetch). -""" function fetch(t::Task) wait(t) task_result(t) end +end # !JULIA_PARTR ## lexically-scoped waiting for multiple items @@ -248,8 +252,6 @@ macro sync(block) end end -# schedule an expression to run asynchronously - """ @async @@ -274,6 +276,39 @@ function register_taskdone_hook(t::Task, hook) t end +if JULIA_PARTR + +# runtime system hook called when a task finishes +function task_done_hook(t::Task) + # `finish_task` sets `sigatomic` before entering this function + err = istaskfailed(t) + result = task_result(t) + handled = false + if err + t.backtrace = catch_backtrace() + end + + # Execute any other hooks registered in the TLS + if isa(t.storage, IdDict) && haskey(t.storage, :TASKDONE_HOOKS) + foreach(hook -> hook(t), t.storage[:TASKDONE_HOOKS]) + delete!(t.storage, :TASKDONE_HOOKS) + handled = true + end + + if err && !handled + if isa(result,InterruptException) && isdefined(Base,:active_repl_backend) && + active_repl_backend.backend_task.state == :runnable && + #isempty(Workqueue) && # TODO + active_repl_backend.in_eval + throwto(active_repl_backend.backend_task, result) # this terminates the task + end + end + # Clear sigatomic before waiting + sigatomic_end() +end + +else # !JULIA_PARTR + # runtime system hook called when a task finishes function task_done_hook(t::Task) # `finish_task` sets `sigatomic` before entering this function @@ -321,6 +356,8 @@ function task_done_hook(t::Task) end end +end # !JULIA_PARTR + """ timedwait(testcb::Function, secs::Float64; pollint::Float64=0.1) diff --git a/base/threadingconstructs.jl b/base/threadingconstructs.jl index 61a1f598546a6..ebe11096391ce 100644 --- a/base/threadingconstructs.jl +++ b/base/threadingconstructs.jl @@ -96,7 +96,11 @@ macro threads(args...) 
throw(ArgumentError("need an expression argument to @threads")) end if ex.head === :for - return _threadsfor(ex.args[1],ex.args[2]) + if Base.JULIA_PARTR + return esc(ex) + else + return _threadsfor(ex.args[1],ex.args[2]) + end else throw(ArgumentError("unrecognized argument to @threads")) end diff --git a/contrib/julia-config.jl b/contrib/julia-config.jl index 8ac742fade6c1..db5a5f0c28de6 100755 --- a/contrib/julia-config.jl +++ b/contrib/julia-config.jl @@ -62,6 +62,9 @@ function cflags() if threadingOn() print(flags, " -DJULIA_ENABLE_THREADING=1") end + if Base.JULIA_PARTR + print(flags, " -DJULIA_ENABLE_PARTR") + end if Sys.isunix() print(flags, " -fPIC") end diff --git a/doc/src/manual/faq.md b/doc/src/manual/faq.md index e6ae749fd1b5a..4db755c652d89 100644 --- a/doc/src/manual/faq.md +++ b/doc/src/manual/faq.md @@ -774,8 +774,7 @@ julia> @sync for i in 1:3 You can lock your writes with a `ReentrantLock` like this: ```jldoctest -julia> l = ReentrantLock() -ReentrantLock(nothing, Condition(Any[]), 0) +julia> l = ReentrantLock(); julia> @sync for i in 1:3 @async begin diff --git a/src/Makefile b/src/Makefile index d8e1c6a49b115..cc8b459b7abb4 100644 --- a/src/Makefile +++ b/src/Makefile @@ -42,7 +42,7 @@ SRCS := \ jltypes gf typemap ast builtins module interpreter symbol \ dlload sys init task array dump staticdata toplevel jl_uv datatype \ simplevector APInt-C runtime_intrinsics runtime_ccall precompile \ - threadgroup threading stackwalk gc gc-debug gc-pages gc-stacks method \ + threading forkjoin-ti partr stackwalk gc gc-debug gc-pages gc-stacks method \ jlapi signal-handling safepoint jloptions timing subtype rtutils \ crc32c processor @@ -203,7 +203,7 @@ $(BUILDDIR)/gc-debug.o $(BUILDDIR)/gc-debug.dbg.obj: $(SRCDIR)/gc.h $(BUILDDIR)/gc-pages.o $(BUILDDIR)/gc-pages.dbg.obj: $(SRCDIR)/gc.h $(BUILDDIR)/signal-handling.o $(BUILDDIR)/signal-handling.dbg.obj: $(addprefix $(SRCDIR)/,signals-*.c) $(BUILDDIR)/dump.o $(BUILDDIR)/dump.dbg.obj: $(addprefix $(SRCDIR)/,common_symbols1.inc common_symbols2.inc) -$(addprefix $(BUILDDIR)/,threading.o threading.dbg.obj gc.o gc.dbg.obj init.c init.dbg.obj task.o task.dbg.obj): $(addprefix $(SRCDIR)/,threading.h threadgroup.h) +$(addprefix $(BUILDDIR)/,threading.o threading.dbg.obj gc.o gc.dbg.obj init.c init.dbg.obj task.o task.dbg.obj): $(addprefix $(SRCDIR)/,threading.h) $(addprefix $(BUILDDIR)/,APInt-C.o APInt-C.dbg.obj runtime_intrinsics.o runtime_intrinsics.dbg.obj): $(SRCDIR)/APInt-C.h # archive library file rules diff --git a/src/atomics.h b/src/atomics.h index 493f0297892bc..ebfc66bbd83f4 100644 --- a/src/atomics.h +++ b/src/atomics.h @@ -62,8 +62,12 @@ // the __atomic builtins or c11 atomics with GNU extension or c11 _Generic # define jl_atomic_compare_exchange(obj, expected, desired) \ __sync_val_compare_and_swap(obj, expected, desired) +# define jl_atomic_bool_compare_exchange(obj, expected, desired) \ + __sync_bool_compare_and_swap(obj, expected, desired) # define jl_atomic_exchange(obj, desired) \ __atomic_exchange_n(obj, desired, __ATOMIC_SEQ_CST) +# define jl_atomic_exchange_generic(obj, desired, orig)\ + __atomic_exchange(obj, desired, orig, __ATOMIC_SEQ_CST) # define jl_atomic_exchange_relaxed(obj, desired) \ __atomic_exchange_n(obj, desired, __ATOMIC_RELAXED) // TODO: Maybe add jl_atomic_compare_exchange_weak for spin lock @@ -115,6 +119,7 @@ jl_atomic_fetch_add(T *obj, T2 arg) { return (T)_InterlockedExchangeAdd64((volatile __int64*)obj, (__int64)arg); } +// TODO: jl_atomic_exchange_generic #define jl_atomic_fetch_add_relaxed(obj, 
arg) jl_atomic_fetch_add(obj, arg) // and @@ -200,6 +205,7 @@ jl_atomic_compare_exchange(volatile T *obj, T2 expected, T3 desired) return (T)_InterlockedCompareExchange64((volatile __int64*)obj, (__int64)desired, (__int64)expected); } +// TODO: jl_atomic_bool_compare_exchange // atomic exchange template static inline typename std::enable_if::type diff --git a/src/builtins.c b/src/builtins.c index edf5dc35ff4c1..c00ba5e189ec0 100644 --- a/src/builtins.c +++ b/src/builtins.c @@ -1243,6 +1243,9 @@ void jl_init_primitives(void) JL_GC_DISABLED add_builtin("Ref", (jl_value_t*)jl_ref_type); add_builtin("Ptr", (jl_value_t*)jl_pointer_type); add_builtin("Task", (jl_value_t*)jl_task_type); +#ifdef JULIA_ENABLE_PARTR + add_builtin("Condition", (jl_value_t*)jl_condition_type); +#endif add_builtin("AbstractArray", (jl_value_t*)jl_abstractarray_type); add_builtin("DenseArray", (jl_value_t*)jl_densearray_type); diff --git a/src/dump.c b/src/dump.c index d29ae8565bdde..c54cfc4af11e6 100644 --- a/src/dump.c +++ b/src/dump.c @@ -3191,7 +3191,10 @@ void jl_init_serializer(void) jl_box_int64(12), jl_box_int64(13), jl_box_int64(14), jl_box_int64(15), jl_box_int64(16), jl_box_int64(17), jl_box_int64(18), jl_box_int64(19), jl_box_int64(20), - jl_box_int64(21), jl_box_int64(22), + jl_box_int64(21), +#ifndef JULIA_ENABLE_PARTR + jl_box_int64(22), +#endif jl_bool_type, jl_linenumbernode_type, jl_pinode_type, jl_upsilonnode_type, jl_type_type, jl_bottom_type, jl_ref_type, @@ -3205,6 +3208,9 @@ void jl_init_serializer(void) jl_emptytuple_type, jl_array_uint8_type, jl_code_info_type, jl_typeofbottom_type, jl_namedtuple_type, jl_array_int32_type, jl_typedslot_type, jl_uint32_type, jl_uint64_type, +#ifdef JULIA_ENABLE_PARTR + jl_condition_type, +#endif ptls->root_task, diff --git a/src/forkjoin-ti.c b/src/forkjoin-ti.c new file mode 100644 index 0000000000000..8b7d2d3cc1620 --- /dev/null +++ b/src/forkjoin-ti.c @@ -0,0 +1,356 @@ +// This file is a part of Julia. 
License is MIT: https://julialang.org/license + +#include +#include +#include +#include +#include + +#include "julia.h" +#include "julia_internal.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#include "options.h" +#include "threading.h" + +#ifdef JULIA_ENABLE_THREADING +#ifdef JULIA_ENABLE_FORKJOIN_TI + +// for the barrier +typedef struct { + int sense; +} ti_thread_sense_t; + +// thread group +typedef struct { + int16_t *tid_map, num_threads, added_threads; + uint8_t num_sockets, num_cores, num_threads_per_core; + + // fork/join/barrier + uint8_t group_sense; // Written only by master thread + ti_thread_sense_t **thread_sense; + void *envelope; + + // to let threads sleep + uv_mutex_t alarm_lock; + uv_cond_t alarm; +} ti_threadgroup_t; + +// thread state +enum { + TI_THREAD_INIT, + TI_THREAD_WORK +}; + +// passed to thread function +typedef struct { + int16_t volatile state; + ti_threadgroup_t *tg; +} ti_threadarg_t; + +// work command to thread function +typedef struct { + jl_method_instance_t *mfunc; + jl_callptr_t fptr; + jl_value_t **args; + uint32_t nargs; + jl_value_t *ret; + size_t world_age; +} ti_threadwork_t; + +// for broadcasting work to threads +static ti_threadwork_t threadwork; + +// only one thread group for now +static ti_threadgroup_t *tgworld; + +extern uint64_t jl_thread_sleep_threshold; + +// threadgroup functions +// --- +static int ti_threadgroup_create(uint8_t num_sockets, uint8_t num_cores, + uint8_t num_threads_per_core, + ti_threadgroup_t **newtg) +{ + int i; + ti_threadgroup_t *tg; + int num_threads = num_sockets * num_cores * num_threads_per_core; + + tg = (ti_threadgroup_t*)jl_malloc_aligned(sizeof(ti_threadgroup_t), 64); + tg->tid_map = (int16_t*)jl_malloc_aligned(num_threads * sizeof(int16_t), 64); + for (i = 0; i < num_threads; ++i) + tg->tid_map[i] = -1; + tg->num_sockets = num_sockets; + tg->num_cores = num_cores; + tg->num_threads_per_core = num_threads_per_core; + tg->num_threads = num_threads; + tg->added_threads = 0; + tg->thread_sense = (ti_thread_sense_t**) + jl_malloc_aligned(num_threads * sizeof(ti_thread_sense_t*), 64); + for (i = 0; i < num_threads; i++) + tg->thread_sense[i] = NULL; + jl_atomic_store_release(&tg->group_sense, 0); + + uv_mutex_init(&tg->alarm_lock); + uv_cond_init(&tg->alarm); + + *newtg = tg; + return 0; +} + +static int ti_threadgroup_addthread(ti_threadgroup_t *tg, int16_t ext_tid, + int16_t *tgtid) +{ + if (ext_tid < 0 || ext_tid >= tg->num_threads) + return -1; + if (tg->tid_map[ext_tid] != -1) + return -2; + if (tg->added_threads == tg->num_threads) + return -3; + + tg->tid_map[ext_tid] = tg->added_threads++; + if (tgtid) *tgtid = tg->tid_map[ext_tid]; + + return 0; +} + +static int ti_threadgroup_initthread(ti_threadgroup_t *tg, int16_t ext_tid) +{ + ti_thread_sense_t *ts; + + if (ext_tid < 0 || ext_tid >= tg->num_threads) + return -1; + if (tg->thread_sense[tg->tid_map[ext_tid]] != NULL) + return -2; + if (tg->num_threads == 0) + return -3; + + ts = (ti_thread_sense_t*)jl_malloc_aligned(sizeof(ti_thread_sense_t), 64); + ts->sense = 1; + tg->thread_sense[tg->tid_map[ext_tid]] = ts; + + return 0; +} + +static int ti_threadgroup_fork(ti_threadgroup_t *tg, int16_t ext_tid, void **bcast_val, int init) +{ + uint8_t *group_sense = &tg->group_sense; + int16_t tid = tg->tid_map[ext_tid]; + int thread_sense = tg->thread_sense[tid]->sense; + if (tid == 0) { + tg->envelope = bcast_val ? 
*bcast_val : NULL; + // synchronize `tg->envelope` and `tg->group_sense` + jl_atomic_store_release(group_sense, thread_sense); + + // if it's possible that threads are sleeping, signal them + if (jl_thread_sleep_threshold) { + uv_mutex_lock(&tg->alarm_lock); + uv_cond_broadcast(&tg->alarm); + uv_mutex_unlock(&tg->alarm_lock); + } + } + else { + // spin up to threshold ns (count sheep), then sleep + uint64_t spin_ns; + uint64_t spin_start = 0; + // synchronize `tg->envelope` and `tg->group_sense` + while (jl_atomic_load_acquire(group_sense) != thread_sense) { + if (jl_thread_sleep_threshold) { + if (!spin_start) { + // Lazily initialize spin_start since uv_hrtime is expensive + spin_start = uv_hrtime(); + continue; + } + spin_ns = uv_hrtime() - spin_start; + // In case uv_hrtime is not monotonic, we'll sleep earlier + if (init || spin_ns >= jl_thread_sleep_threshold) { + uv_mutex_lock(&tg->alarm_lock); + if (jl_atomic_load_acquire(group_sense) != thread_sense) { + uv_cond_wait(&tg->alarm, &tg->alarm_lock); + } + uv_mutex_unlock(&tg->alarm_lock); + spin_start = 0; + init = 0; + continue; + } + } + jl_cpu_pause(); + } + if (bcast_val) + *bcast_val = tg->envelope; + } + + return 0; +} + +static int ti_threadgroup_join(ti_threadgroup_t *tg, int16_t ext_tid) +{ + int *p_thread_sense = &tg->thread_sense[tg->tid_map[ext_tid]]->sense; + jl_atomic_store_release(p_thread_sense, !*p_thread_sense); + if (tg->tid_map[ext_tid] == 0) { + jl_ptls_t ptls = jl_get_ptls_states(); + int8_t group_sense = tg->group_sense; + for (int i = 1; i < tg->num_threads; ++i) { + while (jl_atomic_load_acquire(&tg->thread_sense[i]->sense) == group_sense) { + jl_gc_safepoint_(ptls); + jl_cpu_pause(); + } + } + } + + return 0; +} + + +// threading interface +// --- +void jl_init_threadinginfra(void) { } + +void jl_init_threadarg(jl_threadarg_t *targ) +{ + ti_threadarg_t *tiarg = (ti_threadarg_t *)malloc(sizeof (ti_threadarg_t)); + tiarg->state = TI_THREAD_INIT; + targ->arg = (void *)tiarg; +} + +void jl_init_started_threads(jl_threadarg_t **targs) +{ + // the analyzer doesn't know jl_n_threads doesn't change, help it + size_t nthreads = jl_n_threads; + + // set up the world thread group + ti_threadgroup_create(1, nthreads, 1, &tgworld); + for (int i = 0; i < nthreads; ++i) + ti_threadgroup_addthread(tgworld, i, NULL); + + jl_ptls_t ptls = jl_get_ptls_states(); + ti_threadgroup_initthread(tgworld, ptls->tid); + + // give the threads the world thread group; they will block waiting for fork + for (int i = 0; i < nthreads - 1; ++i) { + ti_threadarg_t *tiarg = (ti_threadarg_t *)targs[i]->arg; + tiarg->tg = tgworld; + jl_atomic_store_release(&tiarg->state, TI_THREAD_WORK); + } +} + +// thread function: used by all except the main thread +void jl_threadfun(void *arg) +{ + jl_ptls_t ptls = jl_get_ptls_states(); + jl_threadarg_t *targ = (jl_threadarg_t *)arg; + ti_threadarg_t *tiarg = (ti_threadarg_t *)targ->arg; + ti_threadgroup_t *tg; + ti_threadwork_t *work; + + // initialize this thread (set tid, create heap, etc.) + jl_init_threadtls(targ->tid); + void *stack_lo, *stack_hi; + jl_init_stack_limits(0, &stack_lo, &stack_hi); + + // set up tasking + jl_init_root_task(stack_lo, stack_hi); + + // wait for a thread group + while (jl_atomic_load_acquire(&tiarg->state) == TI_THREAD_INIT) + jl_cpu_pause(); + + // Assuming the functions called below don't contain unprotected GC + // critical region. In general, the following part of this function + // shouldn't call any managed code without calling `jl_gc_unsafe_enter` + // first. 
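The fork half of this handshake is a sense-reversing broadcast: the master writes the work `envelope`, then release-stores the group sense; workers acquire-spin on the sense (sleeping once past `jl_thread_sleep_threshold`) and read the envelope only after observing the flip. A minimal Julia sketch of that publish/observe ordering (illustrative, not the C API):

```julia
using Base.Threads

const group_sense = Atomic{Int}(0)
const envelope = Ref{Any}(nothing)

@threads for tid in 1:2
    if tid == 1                        # "master": publish work, then flip the sense
        envelope[] = "broadcast work"
        atomic_xchg!(group_sense, 1)   # the C code uses a release store
    else                               # "worker": spin until the sense flips
        while group_sense[] != 1
            # spin (the C version sleeps after the spin threshold)
        end
        @assert envelope[] == "broadcast work"
    end
end
```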
+ jl_gc_state_set(ptls, JL_GC_STATE_SAFE, 0); + uv_barrier_wait(targ->barrier); + + // initialize this thread in the thread group + tg = tiarg->tg; + ti_threadgroup_initthread(tg, ptls->tid); + + // free the thread argument here + free(tiarg); + free(targ); + + int init = 1; + + // work loop + for (; ;) { + ti_threadgroup_fork(tg, ptls->tid, (void **)&work, init); + init = 0; + + JL_GC_PROMISE_ROOTED(work); + + if (work) { + // TODO: before we support getting return value from + // the work, and after we have proper GC transition + // support in the codegen and runtime we don't need to + // enter GC unsafe region when starting the work. + int8_t gc_state = jl_gc_unsafe_enter(ptls); + size_t last_age = ptls->world_age; + ptls->world_age = work->world_age; + jl_thread_run_fun(work->fptr, work->mfunc, work->args, work->nargs); + ptls->world_age = last_age; + jl_gc_unsafe_leave(ptls, gc_state); + } + + ti_threadgroup_join(tg, ptls->tid); + } +} + +// interface to user code: specialize and compile the user thread function +// and run it in all threads +JL_DLLEXPORT jl_value_t *jl_threading_run(jl_value_t *_args) +{ + jl_ptls_t ptls = jl_get_ptls_states(); + // GC safe + uint32_t nargs; + jl_value_t **args; + if (!jl_is_svec(_args)) { + nargs = 1; + args = &_args; + } + else { + nargs = jl_svec_len(_args); + args = jl_svec_data(_args); + } + + int8_t gc_state = jl_gc_unsafe_enter(ptls); + + size_t world = jl_get_ptls_states()->world_age; + + threadwork.mfunc = jl_lookup_generic(args, nargs, + jl_int32hash_fast(jl_return_address()), ptls->world_age); + // Ignore constant return value for now. + threadwork.fptr = jl_compile_method_internal(&threadwork.mfunc, world); + if (threadwork.fptr == jl_fptr_const_return) + return jl_nothing; + threadwork.args = args; + threadwork.nargs = nargs; + threadwork.ret = jl_nothing; + threadwork.world_age = world; + + // fork the world thread group + ti_threadwork_t *tw = &threadwork; + ti_threadgroup_fork(tgworld, ptls->tid, (void **)&tw, 0); + + JL_GC_PROMISE_ROOTED(threadwork.mfunc); + + // this thread must do work too + tw->ret = jl_thread_run_fun(threadwork.fptr, threadwork.mfunc, args, nargs); + + // wait for completion + ti_threadgroup_join(tgworld, ptls->tid); + + jl_gc_unsafe_leave(ptls, gc_state); + + return tw->ret; +} + +#endif // JULIA_ENABLE_FORKJOIN_TI +#endif // JULIA_ENABLE_THREADING + +#ifdef __cplusplus +} +#endif diff --git a/src/forkjoin-ti.h b/src/forkjoin-ti.h new file mode 100644 index 0000000000000..0b882cbb7eed4 --- /dev/null +++ b/src/forkjoin-ti.h @@ -0,0 +1,21 @@ +// This file is a part of Julia. 
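`jl_threading_run` is the C entry point behind the non-partr `@threads`: it compiles the loop-body closure once, broadcasts it through the world thread group via `ti_threadgroup_fork`, runs it on the calling thread as well, and joins. The user-visible effect (illustrative; assumes Julia was started with multiple threads, though it also runs with one):

```julia
using Base.Threads

acc = zeros(Int, nthreads())
@threads for i in 1:8        # the body is broadcast to every thread
    acc[threadid()] += i     # each thread handles a contiguous slice of 1:8
end
sum(acc) == 36               # every iteration ran exactly once
```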
License is MIT: https://julialang.org/license + +#ifndef FORKJOINTI_H +#define FORKJOINTI_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// interface provided by this threading infrastructure +JL_DLLEXPORT jl_value_t *jl_threading_run(jl_value_t *_args); + + +#ifdef __cplusplus +} +#endif + +#endif /* FORKJOINTI_H */ + diff --git a/src/gc-debug.c b/src/gc-debug.c index 40dc55a4f0550..92c576cbec8bd 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -576,13 +576,15 @@ static void gc_scrub_range(char *low, char *high) static void gc_scrub_task(jl_task_t *ta) { - int16_t tid = ta->tid; + int16_t tid = ta->current_tid; jl_ptls_t ptls = jl_get_ptls_states(); - jl_ptls_t ptls2 = jl_all_tls_states[tid]; + jl_ptls_t ptls2 = NULL; + if (tid != -1) + ptls2 = jl_all_tls_states[tid]; char *low; char *high; - if (ta->copy_stack && ta == ptls2->current_task) { + if (ta->copy_stack && ptls2 && ta == ptls2->current_task) { low = (char*)ptls2->stackbase - ptls2->stacksize; high = (char*)ptls2->stackbase; } @@ -593,7 +595,7 @@ static void gc_scrub_task(jl_task_t *ta) else return; - if (ptls == ptls2 && ta == ptls2->current_task) { + if (ptls == ptls2 && ptls2 && ta == ptls2->current_task) { // scan up to current `sp` for current thread and task low = (char*)jl_get_frame_addr(); } diff --git a/src/gc.c b/src/gc.c index e5fb03cf95cc0..b3b2d236b856b 100644 --- a/src/gc.c +++ b/src/gc.c @@ -1634,6 +1634,13 @@ STATIC_INLINE int gc_mark_queue_obj(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_ return (int)nptr; } +#ifdef JULIA_ENABLE_PARTR +int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, jl_value_t *obj) +{ + return gc_mark_queue_obj(gc_cache, sp, obj); +} +#endif + JL_DLLEXPORT int jl_gc_mark_queue_obj(jl_ptls_t ptls, jl_value_t *obj) { return gc_mark_queue_obj(&ptls->gc_cache, &ptls->gc_mark_sp, obj); @@ -2330,8 +2337,10 @@ mark: { jl_task_t *ta = (jl_task_t*)new_obj; gc_scrub_record_task(ta); void *stkbuf = ta->stkbuf; - int16_t tid = ta->tid; - jl_ptls_t ptls2 = jl_all_tls_states[tid]; + int16_t tid = ta->current_tid; + jl_ptls_t ptls2 = NULL; + if (tid != -1) + ptls2 = jl_all_tls_states[tid]; if (gc_cblist_task_scanner) { export_gc_state(ptls, &sp); gc_invoke_callbacks(jl_gc_cb_task_scanner_t, @@ -2347,7 +2356,7 @@ mark: { uintptr_t offset = 0; uintptr_t lb = 0; uintptr_t ub = (uintptr_t)-1; - if (ta == ptls2->current_task) { + if (ptls2 && ta == ptls2->current_task) { s = ptls2->pgcstack; } else if (stkbuf) { @@ -2481,12 +2490,21 @@ static void jl_gc_queue_thread_local(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp gc_mark_queue_obj(gc_cache, sp, ptls2->previous_exception); } +#ifdef JULIA_ENABLE_PARTR +void jl_gc_mark_enqueued_tasks(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp); +#endif + // mark the initial root set static void mark_roots(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp) { // modules gc_mark_queue_obj(gc_cache, sp, jl_main_module); +#ifdef JULIA_ENABLE_PARTR + // tasks + jl_gc_mark_enqueued_tasks(gc_cache, sp); +#endif + // invisible builtin values if (jl_an_empty_vec_any != NULL) gc_mark_queue_obj(gc_cache, sp, jl_an_empty_vec_any); diff --git a/src/init.c b/src/init.c index cf3a420d76d98..2425b6c42ae4f 100644 --- a/src/init.c +++ b/src/init.c @@ -611,7 +611,6 @@ void _julia_init(JL_IMAGE_SEARCH rel) // Make sure we finalize the tls callback before starting any threads. 
jl_get_ptls_states_getter(); #endif - jl_ptls_t ptls = jl_get_ptls_states(); jl_safepoint_init(); libsupport_init(); htable_new(&jl_current_modules, 0); @@ -813,8 +812,10 @@ void jl_get_builtin_hooks(void) int t; for (t = 0; t < jl_n_threads; t++) { jl_ptls_t ptls2 = jl_all_tls_states[t]; - ptls2->root_task->tls = jl_nothing; + ptls2->root_task->storage = jl_nothing; +#ifndef JULIA_ENABLE_PARTR ptls2->root_task->donenotify = jl_nothing; +#endif ptls2->root_task->exception = jl_nothing; ptls2->root_task->result = jl_nothing; } diff --git a/src/julia.h b/src/julia.h index fbc1eb2461b66..47d03c054f3ce 100644 --- a/src/julia.h +++ b/src/julia.h @@ -542,6 +542,9 @@ extern JL_DLLEXPORT jl_unionall_t *jl_anytuple_type_type JL_GLOBALLY_ROOTED; extern JL_DLLEXPORT jl_unionall_t *jl_vararg_type JL_GLOBALLY_ROOTED; extern JL_DLLEXPORT jl_typename_t *jl_vararg_typename JL_GLOBALLY_ROOTED; extern JL_DLLEXPORT jl_datatype_t *jl_task_type JL_GLOBALLY_ROOTED; +#ifdef JULIA_ENABLE_PARTR +extern JL_DLLEXPORT jl_datatype_t *jl_condition_type JL_GLOBALLY_ROOTED; +#endif extern JL_DLLEXPORT jl_datatype_t *jl_function_type JL_GLOBALLY_ROOTED; extern JL_DLLEXPORT jl_datatype_t *jl_builtin_type JL_GLOBALLY_ROOTED; @@ -1606,44 +1609,124 @@ typedef struct _jl_handler_t { size_t world_age; } jl_handler_t; -typedef struct _jl_task_t { +typedef struct _jl_task_t jl_task_t; + +#if defined(JULIA_ENABLE_PARTR) +typedef struct _arriver_t arriver_t; +typedef struct _reducer_t reducer_t; + +typedef struct _jl_taskq_t jl_taskq_t; +typedef struct _jl_taskq_t jl_condition_t; + +struct _jl_taskq_t { + JL_DATA_TYPE + + jl_task_t *head; + jl_mutex_t lock; +}; +#endif + +struct _jl_task_t { JL_DATA_TYPE - jl_value_t *tls; + + /* task local storage */ + jl_value_t *storage; + + /* state */ jl_sym_t *state; + +#ifndef JULIA_ENABLE_PARTR + /* completion queue */ jl_value_t *donenotify; +#endif + + /* execution result */ jl_value_t *result; jl_value_t *exception; jl_value_t *backtrace; jl_value_t *logstate; - jl_function_t *start; -// hidden state: - jl_ucontext_t ctx; // saved thread state - void *stkbuf; // malloc'd memory (either copybuf or stack) - size_t bufsz; // actual sizeof stkbuf + /* task entry point */ + jl_function_t *taskentry; + +#ifdef JULIA_ENABLE_PARTR + /* reduction function entry point */ + jl_function_t *redentry; + + /* completion queue */ + jl_taskq_t cq; + + /* to link this task into queues */ + jl_task_t *next; + + /* parent (first) task of a parfor set */ + jl_task_t *parent; + + /* parfor reduction result */ + jl_value_t *redresult; +#endif + /* --- hidden --- */ + + /* context and stack */ + jl_ucontext_t ctx; // saved thread state + void *stkbuf; // malloc'd memory (either copybuf or stack) + size_t bufsz; // actual sizeof stkbuf unsigned int copy_stack:31; // sizeof stack for copybuf unsigned int started:1; - // current exception handler + /* current exception handler */ jl_handler_t *eh; - // saved gc stack top for context switches + + /* saved gc stack top for context switches */ jl_gcframe_t *gcstack; + // saved exception stack jl_excstack_t *excstack; // current world age size_t world_age; - // id of owning thread - // does not need to be defined until the task runs - int16_t tid; + /* thread currently running this task */ + int16_t current_tid; #ifdef JULIA_ENABLE_THREADING - // This is statically initialized when the task is not holding any locks arraylist_t locks; +#endif +#ifdef JULIA_ENABLE_PARTR + /* grain's range, for parfors */ + int64_t start, end; + + /* to synchronize/reduce grains of a 
parfor */ + arriver_t *arr; + reducer_t *red; + + /* tid of the thread to which this task is sticky */ + int16_t sticky_tid; + + /* the index of this task in the set of grains of a parfor */ + int16_t grain_num; + + /* for the multiqueue */ + int16_t prio; #endif jl_timing_block_t *timing_stack; -} jl_task_t; +}; JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, size_t ssize); + +#ifdef JULIA_ENABLE_PARTR + +JL_DLLEXPORT jl_task_t *jl_task_spawn(jl_task_t *task, jl_value_t *arg, int8_t err, + int8_t unyielding, int8_t sticky); +JL_DLLEXPORT jl_task_t *jl_task_new_multi(jl_value_t *args, size_t ssize, + int64_t count, jl_value_t *rargs); +JL_DLLEXPORT int jl_task_spawn_multi(jl_task_t *task); +JL_DLLEXPORT jl_value_t *jl_task_sync(jl_task_t *task); +JL_DLLEXPORT jl_value_t *jl_task_yield(int requeue); +JL_DLLEXPORT jl_condition_t *jl_condition_new(void); +JL_DLLEXPORT jl_value_t *jl_task_wait(jl_condition_t *c); +JL_DLLEXPORT void jl_task_notify(jl_condition_t *c, jl_value_t *arg, int8_t all, int8_t err); + +#endif // !JULIA_ENABLE_PARTR + JL_DLLEXPORT void jl_switchto(jl_task_t **pt); JL_DLLEXPORT void JL_NORETURN jl_throw(jl_value_t *e JL_MAYBE_UNROOTED); JL_DLLEXPORT void JL_NORETURN jl_rethrow(void); diff --git a/src/julia_internal.h b/src/julia_internal.h index af463a86fc5f4..7d29bcfda695b 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -500,7 +500,6 @@ extern ssize_t jl_tls_offset; extern const int jl_tls_elf_support; void jl_init_threading(void); void jl_start_threads(void); -void jl_shutdown_threading(void); // Whether the GC is running extern char *jl_safepoint_pages; @@ -706,6 +705,22 @@ void jl_copy_excstack(jl_excstack_t *dest, jl_excstack_t *src) JL_NOTSAFEPOINT; // Returns time in nanosec JL_DLLEXPORT uint64_t jl_hrtime(void); +// congruential random number generator +STATIC_INLINE void seed_cong(uint64_t *seed) +{ + *seed = jl_hrtime(); +} +STATIC_INLINE void unbias_cong(uint64_t max, uint64_t *unbias) +{ + *unbias = UINT64_MAX - ((UINT64_MAX % max)+1); +} +STATIC_INLINE uint64_t cong(uint64_t max, uint64_t unbias, uint64_t *seed) +{ + while ((*seed = 69069 * (*seed) + 362437) > unbias) + ; + return *seed % max; +} + // libuv stuff: JL_DLLEXPORT extern void *jl_dl_handle; JL_DLLEXPORT extern void *jl_RTLD_DEFAULT_handle; diff --git a/src/julia_threads.h b/src/julia_threads.h index eedab742ef6b1..cadb7cf236091 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -53,7 +53,7 @@ typedef ucontext_t jl_ucontext_t; // Recursive spin lock typedef struct { - volatile unsigned long owner; + volatile uintptr_t owner; uint32_t count; } jl_mutex_t; @@ -158,6 +158,10 @@ struct _jl_tls_states_t { jl_ucontext_t base_ctx; // base context of stack jl_jmp_buf *safe_restore; int16_t tid; +#ifdef JULIA_ENABLE_PARTR + uint64_t rngseed; + struct _jl_taskq_t *sticky_taskq; +#endif // Temp storage for exception thrown in signal handler. Not rooted. struct _jl_value_t *sig_exception; // Temporary backtrace buffer. Scanned for gc roots when bt_size > 0. 
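The `cong` helpers added to `julia_internal.h` are a 64-bit linear congruential generator (`seed ← 69069·seed + 362437`, mod 2^64) with a rejection bound to avoid modulo bias; partr uses it for cheap per-thread random heap selection. A Julia transliteration (illustrative only):

```julia
# `unbias` is the largest draw for which `seed % max` remains uniform;
# anything above it is rejected and the generator is stepped again.
unbias_cong(max::UInt64) = typemax(UInt64) - (typemax(UInt64) % max + 1)

function cong(max::UInt64, unbias::UInt64, seed::Ref{UInt64})
    while (seed[] = 69069 * seed[] + 362437) > unbias   # LCG step, wraps mod 2^64
    end
    return seed[] % max
end

seed = Ref(UInt64(0x243f6a8885a308d3))   # the C code seeds from jl_hrtime()
ub = unbias_cong(UInt64(16))
cong(UInt64(16), ub, seed)               # uniform in 0:15
```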
diff --git a/src/locks.h b/src/locks.h index b030e8c20403f..bb53887164723 100644 --- a/src/locks.h +++ b/src/locks.h @@ -105,6 +105,22 @@ static inline void jl_mutex_lock(jl_mutex_t *lock) jl_gc_enable_finalizers(ptls, 0); } +static inline int jl_mutex_trylock_nogc(jl_mutex_t *lock) +{ + unsigned long self = jl_thread_self(); + unsigned long owner = jl_atomic_load_acquire(&lock->owner); + if (owner == self) { + lock->count++; + return 1; + } + if (owner == 0 && + jl_atomic_compare_exchange(&lock->owner, 0, self) == 0) { + lock->count = 1; + return 1; + } + return 0; +} + /* Call this function for code that could be called from either a managed or an unmanaged thread */ static inline void jl_mutex_lock_maybe_nogc(jl_mutex_t *lock) diff --git a/src/options.h b/src/options.h index 5a4fc70a1f102..3d64805a03954 100644 --- a/src/options.h +++ b/src/options.h @@ -129,6 +129,12 @@ #define MACHINE_EXCLUSIVE_NAME "JULIA_EXCLUSIVE" #define DEFAULT_MACHINE_EXCLUSIVE 0 +// threading infrastructure selection +#ifndef JULIA_ENABLE_PARTR +#define JULIA_ENABLE_FORKJOIN_TI 1 + +#endif // !JULIA_ENABLE_PARTR + // sanitizer defaults --------------------------------------------------------- diff --git a/src/partr.c b/src/partr.c new file mode 100644 index 0000000000000..8b8672eb56089 --- /dev/null +++ b/src/partr.c @@ -0,0 +1,1154 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +#include +#include +#include +#include + +#include "julia.h" +#include "julia_internal.h" +#include "gc.h" +#include "threading.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef JULIA_ENABLE_THREADING +#ifdef JULIA_ENABLE_PARTR + +// empirically, finish_task needs about 64k stack space to infer/run +// and additionally, gc-stack reserves 64k for the guard pages +#if defined(MINSIGSTKSZ) && MINSIGSTKSZ > 131072 +#define MINSTKSZ MINSIGSTKSZ +#else +#define MINSTKSZ 131072 +#endif + +// task states and stack switching +extern jl_sym_t *done_sym; +extern jl_sym_t *failed_sym; +extern jl_sym_t *runnable_sym; +extern void jl_switchto(jl_task_t **pt); +extern char *jl_alloc_fiber(jl_ucontext_t *t, size_t *ssize, jl_task_t *owner); + +// the lovely task-done-hook hack +extern jl_function_t *task_done_hook_func; + +// GC functions used +extern int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, + jl_gc_mark_sp_t *sp, jl_value_t *obj); + +// thread sleep threshold +extern uint64_t jl_thread_sleep_threshold; + +// multiq +// --- + +/* a task heap */ +typedef struct taskheap_tag { + jl_mutex_t lock; + jl_task_t **tasks; + int16_t ntasks, prio; +} taskheap_t; + +/* multiqueue parameters */ +static const int16_t heap_d = 8; +static const int heap_c = 4; + +/* size of each heap */ +static const int tasks_per_heap = 8192; // TODO: this should be smaller by default, but growable! 
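The multiqueue sketched above is a bank of `heap_c × nthreads` small binary heaps, each guarded by its own lock: insertion trylocks randomly chosen heaps until one yields, and deletion samples two heaps and pops from the one with the better minimum priority, so no global lock is ever taken. A compact Julia sketch of the idea (hypothetical names, assumes the DataStructures.jl package; not the C API):

```julia
using DataStructures   # for BinaryMinHeap

struct MultiQueue
    heaps::Vector{BinaryMinHeap{Int}}
    locks::Vector{ReentrantLock}
end
MultiQueue(p) = MultiQueue([BinaryMinHeap{Int}() for _ in 1:p],
                           [ReentrantLock() for _ in 1:p])

function mq_insert!(q::MultiQueue, prio::Int)
    while true                           # retry with a fresh random heap
        i = rand(1:length(q.heaps))
        trylock(q.locks[i]) || continue  # like jl_mutex_trylock_nogc
        try
            return push!(q.heaps[i], prio)
        finally
            unlock(q.locks[i])
        end
    end
end

function mq_deletemin!(q::MultiQueue)
    i, j = rand(1:length(q.heaps)), rand(1:length(q.heaps))
    minprio(h) = isempty(h) ? typemax(Int) : first(h)
    k = minprio(q.heaps[i]) <= minprio(q.heaps[j]) ? i : j   # two random choices
    lock(q.locks[k]) do
        isempty(q.heaps[k]) ? nothing : pop!(q.heaps[k])
    end
end

q = MultiQueue(4)
mq_insert!(q, 3); mq_insert!(q, 1)
mq_deletemin!(q)   # 1, unless both random samples missed that heap
```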
+ +/* the multiqueue's heaps */ +static taskheap_t *heaps; +static int16_t heap_p; + +/* unbias state for the RNG */ +static uint64_t cong_unbias; + +/* for thread sleeping */ +static uv_mutex_t sleep_lock; +static uv_cond_t sleep_alarm; + + +/* multiq_init() + */ +static inline void multiq_init(void) +{ + heap_p = heap_c * jl_n_threads; + heaps = (taskheap_t *)calloc(heap_p, sizeof(taskheap_t)); + for (int16_t i = 0; i < heap_p; ++i) { + jl_mutex_init(&heaps[i].lock); + heaps[i].tasks = (jl_task_t **)calloc(tasks_per_heap, sizeof(jl_task_t *)); + heaps[i].ntasks = 0; + heaps[i].prio = INT16_MAX; + } + unbias_cong(heap_p, &cong_unbias); +} + + +/* sift_up() + */ +static inline void sift_up(taskheap_t *heap, int16_t idx) +{ + if (idx > 0) { + int16_t parent = (idx-1)/heap_d; + if (heap->tasks[idx]->prio < heap->tasks[parent]->prio) { + jl_task_t *t = heap->tasks[parent]; + heap->tasks[parent] = heap->tasks[idx]; + heap->tasks[idx] = t; + sift_up(heap, parent); + } + } +} + + +/* sift_down() + */ +static inline void sift_down(taskheap_t *heap, int16_t idx) +{ + if (idx < heap->ntasks) { + for (int16_t child = heap_d*idx + 1; + child < tasks_per_heap && child <= heap_d*idx + heap_d; + ++child) { + if (heap->tasks[child] + && heap->tasks[child]->prio < heap->tasks[idx]->prio) { + jl_task_t *t = heap->tasks[idx]; + heap->tasks[idx] = heap->tasks[child]; + heap->tasks[child] = t; + sift_down(heap, child); + } + } + } +} + + +/* multiq_insert() + */ +static inline int multiq_insert(jl_task_t *task, int16_t priority) +{ + jl_ptls_t ptls = jl_get_ptls_states(); + uint64_t rn; + + task->prio = priority; + do { + rn = cong(heap_p, cong_unbias, &ptls->rngseed); + } while (!jl_mutex_trylock_nogc(&heaps[rn].lock)); + + if (heaps[rn].ntasks >= tasks_per_heap) { + jl_mutex_unlock_nogc(&heaps[rn].lock); + jl_error("multiq insertion failed, increase #tasks per heap"); + return -1; + } + + heaps[rn].tasks[heaps[rn].ntasks++] = task; + sift_up(&heaps[rn], heaps[rn].ntasks-1); + jl_mutex_unlock_nogc(&heaps[rn].lock); + int16_t prio = jl_atomic_load(&heaps[rn].prio); + if (task->prio < prio) + jl_atomic_compare_exchange(&heaps[rn].prio, prio, task->prio); + + return 0; +} + + +/* multiq_deletemin() + */ +static inline jl_task_t *multiq_deletemin(void) +{ + jl_ptls_t ptls = jl_get_ptls_states(); + uint64_t rn1 = 0, rn2; + int16_t i, prio1, prio2; + jl_task_t *task; + + for (i = 0; i < heap_p; ++i) { + rn1 = cong(heap_p, cong_unbias, &ptls->rngseed); + rn2 = cong(heap_p, cong_unbias, &ptls->rngseed); + prio1 = jl_atomic_load(&heaps[rn1].prio); + prio2 = jl_atomic_load(&heaps[rn2].prio); + if (prio1 > prio2) { + prio1 = prio2; + rn1 = rn2; + } + else if (prio1 == prio2 && prio1 == INT16_MAX) + continue; + if (jl_mutex_trylock_nogc(&heaps[rn1].lock)) { + if (prio1 == heaps[rn1].prio) + break; + jl_mutex_unlock_nogc(&heaps[rn1].lock); + } + } + if (i == heap_p) + return NULL; + + task = heaps[rn1].tasks[0]; + heaps[rn1].tasks[0] = heaps[rn1].tasks[--heaps[rn1].ntasks]; + heaps[rn1].tasks[heaps[rn1].ntasks] = NULL; + prio1 = INT16_MAX; + if (heaps[rn1].ntasks > 0) { + sift_down(&heaps[rn1], 0); + prio1 = heaps[rn1].tasks[0]->prio; + } + jl_atomic_store(&heaps[rn1].prio, prio1); + jl_mutex_unlock_nogc(&heaps[rn1].lock); + + return task; +} + + +// sync trees +// --- + +/* arrival tree */ +struct _arriver_t { + int16_t index, next_avail; + int16_t **tree; +}; + +/* reduction tree */ +struct _reducer_t { + int16_t index, next_avail; + jl_value_t ***tree; +}; + + +/* pool of arrival trees */ +static arriver_t 
*arriverpool; +static int16_t num_arrivers, num_arriver_tree_nodes, next_arriver; + +/* pool of reduction trees */ +static reducer_t *reducerpool; +static int16_t num_reducers, num_reducer_tree_nodes, next_reducer; + + +/* synctreepool_init() + */ +static inline void synctreepool_init(void) +{ + num_arriver_tree_nodes = (GRAIN_K * jl_n_threads) - 1; + num_reducer_tree_nodes = (2 * GRAIN_K * jl_n_threads) - 1; + + /* num_arrivers = ((GRAIN_K * jl_n_threads) ^ ARRIVERS_P) + 1 */ + num_arrivers = GRAIN_K * jl_n_threads; + for (int i = 1; i < ARRIVERS_P; ++i) + num_arrivers = num_arrivers * num_arrivers; + ++num_arrivers; + + num_reducers = num_arrivers * REDUCERS_FRAC; + + /* allocate */ + arriverpool = (arriver_t *)calloc(num_arrivers, sizeof (arriver_t)); + next_arriver = 0; + for (int i = 0; i < num_arrivers; ++i) { + arriverpool[i].index = i; + arriverpool[i].next_avail = i + 1; + arriverpool[i].tree = (int16_t **) + jl_malloc_aligned(num_arriver_tree_nodes * sizeof (int16_t *), 64); + for (int j = 0; j < num_arriver_tree_nodes; ++j) + arriverpool[i].tree[j] = (int16_t *)jl_malloc_aligned(sizeof (int16_t), 64); + } + arriverpool[num_arrivers - 1].next_avail = -1; + + reducerpool = (reducer_t *)calloc(num_reducers, sizeof (reducer_t)); + next_reducer = 0; + for (int i = 0; i < num_reducers; ++i) { + reducerpool[i].index = i; + reducerpool[i].next_avail = i + 1; + reducerpool[i].tree = (jl_value_t ***) + jl_malloc_aligned(num_reducer_tree_nodes * sizeof (jl_value_t **), 64); + for (int j = 0; j < num_reducer_tree_nodes; ++j) + reducerpool[i].tree[j] = (jl_value_t **)jl_malloc_aligned(sizeof (jl_value_t *), 64); + } + if (num_reducers > 0) + reducerpool[num_reducers - 1].next_avail = -1; + else + next_reducer = -1; +} + + +/* arriver_alloc() + */ +static inline arriver_t *arriver_alloc(void) +{ + int16_t candidate; + arriver_t *arr; + + do { + candidate = jl_atomic_load(&next_arriver); + if (candidate == -1) + return NULL; + arr = &arriverpool[candidate]; + } while (!jl_atomic_bool_compare_exchange(&next_arriver, + candidate, arr->next_avail)); + return arr; +} + + +/* arriver_free() + */ +static inline void arriver_free(arriver_t *arr) +{ + for (int i = 0; i < num_arriver_tree_nodes; ++i) + *arr->tree[i] = 0; + + jl_atomic_exchange_generic(&next_arriver, &arr->index, &arr->next_avail); +} + + +/* reducer_alloc() + */ +static inline reducer_t *reducer_alloc(void) +{ + int16_t candidate; + reducer_t *red; + + do { + candidate = jl_atomic_load(&next_reducer); + if (candidate == -1) + return NULL; + red = &reducerpool[candidate]; + } while (!jl_atomic_bool_compare_exchange(&next_reducer, + candidate, red->next_avail)); + return red; +} + + +/* reducer_free() + */ +static inline void reducer_free(reducer_t *red) +{ + for (int i = 0; i < num_reducer_tree_nodes; ++i) + *red->tree[i] = 0; + + jl_atomic_exchange_generic(&next_reducer, &red->index, &red->next_avail); +} + + +/* last_arriver() + */ +static inline int last_arriver(arriver_t *arr, int idx) +{ + int arrived, aidx = idx + (GRAIN_K * jl_n_threads) - 1; + + while (aidx > 0) { + --aidx; + aidx >>= 1; + arrived = jl_atomic_fetch_add(arr->tree[aidx], 1); + if (!arrived) return 0; + } + + return 1; +} + + +#if 0 +/* reduce() + */ +static inline jl_value_t *reduce(arriver_t *arr, reducer_t *red, jl_function_t *redfun, + jl_value_t *val, int idx) +{ + int arrived, aidx = idx + (GRAIN_K * jl_n_threads) - 1, ridx = aidx, nidx; + + *red->tree[ridx] = val; + while (aidx > 0) { + --aidx; + aidx >>= 1; + arrived = jl_atomic_fetch_add(arr->tree[aidx], 
1); + if (!arrived) return NULL; + + /* neighbor has already arrived, get its value and reduce it */ + nidx = ridx & 0x1 ? ridx + 1 : ridx - 1; + /* TODO: need to pass in val and red->tree[nidx] */ + JL_TRY { + val = fptr(mfunc, rargs, nrargs); + } + JL_CATCH { + val = jl_current_exception(); + } + + /* move up the tree */ + --ridx; + ridx >>= 1; + *red->tree[ridx] = val; + } + + return val; +} +#endif + +// parallel task runtime +// --- + +// sticky task queues need to be visible to all threads +jl_taskq_t *sticky_taskqs; + + +// initialize the threading infrastructure +void jl_init_threadinginfra(void) +{ + /* initialize the synchronization trees pool and the multiqueue */ + synctreepool_init(); + multiq_init(); + + /* allocate sticky task queues */ + sticky_taskqs = (jl_taskq_t *)jl_malloc_aligned(jl_n_threads * sizeof(jl_taskq_t), 64); + + /* initialize the sleep mechanism */ + uv_mutex_init(&sleep_lock); + uv_cond_init(&sleep_alarm); +} + + +// initialize the thread function argument +void jl_init_threadarg(jl_threadarg_t *targ) { } + + +// helper for final thread initialization +static void init_started_thread(void) +{ + jl_ptls_t ptls = jl_get_ptls_states(); + + /* allocate this thread's sticky task queue pointer and initialize the lock */ + seed_cong(&ptls->rngseed); + ptls->sticky_taskq = &sticky_taskqs[ptls->tid]; + ptls->sticky_taskq->head = NULL; + JL_MUTEX_INIT(&ptls->sticky_taskq->lock); +} + + +// once the threads are started, perform any final initializations +void jl_init_started_threads(jl_threadarg_t **targs) +{ + // master thread final initialization + init_started_thread(); +} + + +static int run_next(void); + + +// thread function: used by all except the main thread +void jl_threadfun(void *arg) +{ + jl_threadarg_t *targ = (jl_threadarg_t *)arg; + + // initialize this thread (set tid, create heap, set up root task) + jl_init_threadtls(targ->tid); + void *stack_lo, *stack_hi; + jl_init_stack_limits(0, &stack_lo, &stack_hi); + init_started_thread(); + jl_init_root_task(stack_lo, stack_hi); + + // Assuming the functions called below don't contain unprotected GC + // critical region. In general, the following part of this function + // shouldn't call any managed code without calling `jl_gc_unsafe_enter` + // first. + jl_ptls_t ptls = jl_get_ptls_states(); + jl_gc_state_set(ptls, JL_GC_STATE_SAFE, 0); + uv_barrier_wait(targ->barrier); + + // free the thread argument here + free(targ); + + jl_current_task->state = done_sym; + run_next(); + + // shouldn't get here + gc_debug_critical_error(); + abort(); +} + + +// enqueue the specified task for execution +static void enqueue_task(jl_task_t *task) +{ + /* sticky tasks go to the thread's sticky queue */ + if (task->sticky_tid != -1) { + jl_taskq_t *taskq = &sticky_taskqs[task->sticky_tid]; + JL_LOCK(&taskq->lock); + if (!taskq->head) + taskq->head = task; + else { + jl_task_t *pt = taskq->head; + while (pt->next) + pt = pt->next; + pt->next = task; + } + JL_UNLOCK(&taskq->lock); + } + + /* all others go back into the multiq */ + else + multiq_insert(task, task->prio); + + /* stop the event loop */ + uv_stop(jl_global_event_loop()); + + /* wake up threads */ + if (jl_thread_sleep_threshold) { + uv_mutex_lock(&sleep_lock); + uv_cond_broadcast(&sleep_alarm); + uv_mutex_unlock(&sleep_lock); + } +} + + +// parfor grains must synchronize/reduce as they end +static void sync_grains(jl_task_t *task) +{ + int was_last = 0; + + /* TODO kp: fix */ + /* TODO kp: cascade exception(s) if any */ + + /* reduce... 
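+       (each grain bumps the arrival counts on its path up the tree;
+       whichever grain reaches the root last is responsible for waking
+       the parent and freeing the sync trees)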
*/ + if (task->red) { + //task->result = reduce(task->arr, task->red, task->rfptr, task->mredfunc, + // task->rargs, task->result, task->grain_num); + jl_gc_wb(task, task->result); + + /* if this task is last, set the result in the parent task */ + if (task->result) { + task->parent->redresult = task->result; + jl_gc_wb(task->parent, task->parent->redresult); + was_last = 1; + } + } + /* ... or just sync */ + else { + if (last_arriver(task->arr, task->grain_num)) + was_last = 1; + } + + /* the last task to finish needs to finish up the loop */ + if (was_last) { + /* a non-parent task must wake up the parent */ + if (task->grain_num > 0) + enqueue_task(task->parent); + + /* this is the parent task which was last; it can just end */ + if (task->red) + reducer_free(task->red); + arriver_free(task->arr); + } + else { + /* the parent task needs to wait */ + if (task->grain_num == 0) { + jl_task_yield(0); + task->result = task->redresult; + jl_gc_wb(task, task->result); + } + } +} + + +// all tasks except the root task start and exit here +void NOINLINE JL_NORETURN start_task(void) +{ + jl_ptls_t ptls = jl_get_ptls_states(); + jl_task_t *task = ptls->current_task; + task->started = 1; + + jl_sym_t *new_state; + + if (task->exception != jl_nothing) { + ptls->bt_size = rec_backtrace(ptls->bt_data, JL_MAX_BT_SIZE); + jl_push_excstack(&task->excstack, task->exception, + ptls->bt_data, ptls->bt_size); + task->result = task->exception; + jl_gc_wb(task, task->result); + new_state = failed_sym; + } + else { + JL_TRY { + if (ptls->defer_signal) { + ptls->defer_signal = 0; + jl_sigint_safepoint(ptls); + } + JL_TIMING(ROOT); + ptls->world_age = jl_world_counter; + task->result = jl_apply(&task->taskentry, 1); + jl_gc_wb(task, task->result); + new_state = done_sym; + } + JL_CATCH { + task->result = task->exception = jl_current_exception(); + jl_gc_wb(task, task->exception); + jl_gc_wb(task, task->result); + new_state = failed_sym; + goto skip_pop_exception; + } +skip_pop_exception:; + } + + /* grain tasks must synchronize */ + if (task->grain_num >= 0) + sync_grains(task); + + /* add back any tasks in this one's completion queue */ + JL_LOCK(&task->cq.lock); + jl_task_t *qtask = task->cq.head; + task->cq.head = NULL; + JL_UNLOCK(&task->cq.lock); + jl_task_t *qnext; + while (qtask) { + qnext = qtask->next; + qtask->next = NULL; + enqueue_task(qtask); + qtask = qnext; + } + + JL_SIGATOMIC_BEGIN(); + + task->state = new_state; + + if (task->copy_stack) // early free of stack + task->stkbuf = NULL; + + /* clear thread state */ + ptls->in_finalizer = 0; + ptls->in_pure_callback = 0; + ptls->world_age = jl_world_counter; + + /* run the task-is-done hook(s) */ + if (task_done_hook_func == NULL) + task_done_hook_func = (jl_function_t *)jl_get_global(jl_base_module, + jl_symbol("task_done_hook")); + if (task_done_hook_func != NULL) { + jl_value_t *args[2] = {task_done_hook_func, (jl_value_t *)task}; + JL_TRY { + jl_apply(args, 2); + } + JL_CATCH { + jl_no_exc_handler(jl_current_exception()); + } + } + + JL_SIGATOMIC_END(); + + /* next task */ + run_next(); + + /* shouldn't reach here */ + gc_debug_critical_error(); + abort(); +} + + +// get the next runnable task +static jl_task_t *get_next_task(void) +{ + jl_ptls_t ptls = jl_get_ptls_states(); + jl_task_t *task = NULL; + JL_GC_PUSH1(&task); + + /* first check for sticky tasks */ + JL_LOCK(&ptls->sticky_taskq->lock); + task = ptls->sticky_taskq->head; + if (task) { + ptls->sticky_taskq->head = task->next; + task->next = NULL; + } + 
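+    /* (FIFO: pop from the head here; enqueue_task() appends at the tail) */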
JL_UNLOCK(&ptls->sticky_taskq->lock); + + /* no sticky tasks, go to the multiq */ + if (!task) task = multiq_deletemin(); + + JL_GC_POP(); + return task; +} + + +// run the next available task +// TODO: deal with the case where another thread gets the task from which a thread is +// still trying to switch away +static int run_next(void) +{ + jl_ptls_t ptls = jl_get_ptls_states(); + jl_task_t *task = NULL; + JL_GC_PUSH1(&task); + + uint64_t spin_ns, spin_start = 0; + while (!task) { + if (jl_thread_sleep_threshold) { + if (spin_start == 0) { + spin_start = uv_hrtime(); + continue; + } + } + + task = get_next_task(); + + if (!task) { + if (ptls->tid == 0) + jl_process_events(jl_global_event_loop()); + else + jl_cpu_pause(); + + if (jl_thread_sleep_threshold) { + spin_ns = uv_hrtime() - spin_start; + if (spin_ns > jl_thread_sleep_threshold) { + uv_mutex_lock(&sleep_lock); + task = get_next_task(); + if (!task) { + // thread 0 makes a blocking call to the event loop + if (ptls->tid == 0) { + uv_mutex_unlock(&sleep_lock); + jl_run_once(jl_global_event_loop()); + } + // other threads just sleep + else { + uv_cond_wait(&sleep_alarm, &sleep_lock); + uv_mutex_unlock(&sleep_lock); + } + } + else uv_mutex_unlock(&sleep_lock); + spin_start = 0; + } + } + } + } + + jl_switchto(&task); + + JL_GC_POP(); + return 1; +} + + +// initialize a task +static void init_task(jl_task_t *task, size_t ssize) +{ + jl_ptls_t ptls = jl_get_ptls_states(); + + task->started = 0; + task->storage = jl_nothing; + task->state = runnable_sym; + task->result = jl_nothing; + task->exception = jl_nothing; + task->backtrace = jl_nothing; + task->logstate = jl_nothing; + task->taskentry = NULL; + task->redentry = NULL; + task->cq.head = NULL; + JL_MUTEX_INIT(&task->cq.lock); + task->next = NULL; + //task->parent = ptls->current_task; + task->parent = NULL; + task->redresult = jl_nothing; + + task->stkbuf = NULL; + task->copy_stack = 0; + if (ssize == 0) { + // stack size unspecified; use default +#if defined(COPY_STACKS) && defined(ALWAYS_COPY_STACKS) + task->copy_stack = 1; + task->bufsz = 0; +#else + task->bufsz = JL_STACK_SIZE; +#endif + } + else { + // user requested stack of a certain size + if (ssize < MINSTKSZ) + ssize = MINSTKSZ; + task->bufsz = ssize; + task->stkbuf = jl_alloc_fiber(&task->ctx, &task->bufsz, task); + if (task->stkbuf == NULL) + jl_throw(jl_memory_exception); + } +#if defined(JL_DEBUG_BUILD) + if (!task->copy_stack) + memset(&task->ctx, 0, sizeof(task->ctx)); +#endif +#ifdef COPY_STACKS + if (task->copy_stack) + memcpy(&task->ctx, &ptls->base_ctx, sizeof(task->ctx)); +#endif + + arraylist_new(&task->locks, 0); + task->eh = NULL; + task->gcstack = NULL; + task->excstack = NULL; + task->world_age = ptls->world_age; + task->current_tid = -1; + task->arr = NULL; + task->red = NULL; + task->sticky_tid = -1; + task->grain_num = -1; + +#ifdef ENABLE_TIMINGS + task->timing_stack = NULL; +#endif +} + + +/* jl_new_task() -- create a task for `f(arg)` + + The created task can then be spawned. + */ +JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *_taskentry, size_t ssize) +{ + jl_ptls_t ptls = jl_get_ptls_states(); + + jl_task_t *task = (jl_task_t *)jl_gc_alloc(ptls, sizeof (jl_task_t), jl_task_type); + init_task(task, ssize); + task->taskentry = _taskentry; + + return task; +} + + +/* jl_task_spawn() -- enqueue a task for execution + + If `sticky` is set, the task will only run on the current thread. Continues + the current task if `unyielding` is set or in a few other cases, otherwise + yields. 
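+
+   (The "other cases" are spelled out at the bottom of this function:
+   the spawn also does not yield when the caller is running a finalizer
+   or when the task was already started.)
+
+   A hypothetical call sequence, for illustration only (error handling
+   omitted; `f` is a zero-argument Julia function):
+
+       jl_task_t *t = jl_new_task(f, 0);       // default stack size
+       jl_task_spawn(t, jl_nothing, 0, 0, 0);  // enqueue; may yield here
+       jl_value_t *r = jl_task_sync(t);        // block until `t` completes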
+ */ +JL_DLLEXPORT jl_task_t *jl_task_spawn(jl_task_t *task, jl_value_t *arg, int8_t err, + int8_t unyielding, int8_t sticky) +{ + jl_ptls_t ptls = jl_get_ptls_states(); + + if (task->state != runnable_sym) + jl_error("schedule: Task not runnable"); + + if (!task->started) { + task->prio = ptls->tid; + if (sticky) task->sticky_tid = ptls->tid; + } + if (err) { + task->exception = arg; + jl_gc_wb(task, task->exception); + } + else { + task->result = arg; + if (arg != jl_nothing) + jl_gc_wb(task, task->result); + } + enqueue_task(task); + + /* Yielding here is important -- this is what allows depth first + scheduling. However, this breaks some assumptions made by parts of + the Julia runtime -- I/O and channels. So, we have to allow the caller + to disallow yielding. Also, if the task being scheduled has already + been started, we don't yield. + */ + if (!unyielding + && !ptls->in_finalizer // allow e.g. async printing from finalizers + && !task->started) + jl_task_yield(1); + + return task; +} + + +/* jl_task_new_multi() -- create multiple tasks for `f(arg)` + + Create multiple tasks, each of which invokes `f(arg, start, end)` such + that the sum of `end-start` for all tasks is `count`. If `_redentry` is + specified, the return values from the tasks are reduced; the result can + be retrieved by sync'ing on the parent task which is returned. All the + tasks can be spawned by passing the parent task to `jl_task_spawn_multi()`. + */ +JL_DLLEXPORT jl_task_t *jl_task_new_multi(jl_function_t *_taskentry, size_t ssize, + int64_t count, + jl_function_t *_redentry) +{ + jl_ptls_t ptls = jl_get_ptls_states(); + + int64_t n = GRAIN_K * jl_n_threads; + lldiv_t each = lldiv(count, n); + + /* allocate synchronization tree(s) */ + arriver_t *arr = arriver_alloc(); + if (arr == NULL) + return NULL; + reducer_t *red = NULL; + if (_redentry != NULL) { + red = reducer_alloc(); + if (red == NULL) { + arriver_free(arr); + return NULL; + } + } + + /* allocate (GRAIN_K * nthreads) tasks */ + int64_t start = 0, end = start + each.quot + (each.rem ? 1 : 0); + jl_task_t *parent = (jl_task_t *)jl_gc_alloc(ptls, sizeof (jl_task_t), jl_task_type); + JL_GC_PUSH1(&parent); + init_task(parent, ssize); + parent->taskentry = _taskentry; + parent->redentry = _redentry; + parent->start = start; + parent->end = end; + parent->grain_num = 0; + parent->arr = arr; + parent->red = red; + + jl_task_t *prev = parent, *task = NULL; + start = end; + for (int64_t i = 1; i < n; ++i) { + end = start + each.quot + (i < each.rem ? 1 : 0); + + task = (jl_task_t *)jl_gc_alloc(ptls, sizeof (jl_task_t), jl_task_type); + prev->next = task; + jl_gc_wb(prev, prev->next); + init_task(task, ssize); + task->parent = parent; + task->taskentry = _taskentry; + task->redentry = _redentry; + task->start = start; + task->end = end; + task->grain_num = i; + task->arr = arr; + task->red = red; + + prev = task; + start = end; + } + + JL_GC_POP(); + return parent; +} + + +/* jl_task_spawn_multi() -- spawn multiple tasks + + Spawns multiple tasks that were previously created with `jl_task_new_multi()`. + Yields. + */ +JL_DLLEXPORT int jl_task_spawn_multi(jl_task_t *task) +{ + jl_ptls_t ptls = jl_get_ptls_states(); + + /* enqueue (GRAIN_K * nthreads) tasks */ + jl_task_t *t = task; + for (int64_t i = 0; i < GRAIN_K * jl_n_threads; ++i) { + if (!t) // TODO: this should never happen + return -1; + if (multiq_insert(t, ptls->tid) != 0) // TODO: raise an error? 
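+            /* note: grains already inserted above remain queued on failure */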
+ return -2; + t = t->next; + } + + /* yield to allow depth-first scheduling */ + jl_task_yield(1); + + return 0; +} + + +static void taskq_delete(jl_task_t **pnext, jl_task_t *tgt) +{ + jl_task_t *pt = *pnext; + while (pt) { + if (pt == tgt) { + *pnext = pt->next; + break; + } + pnext = &pt->next; + pt = *pnext; + } + tgt->next = NULL; +} + + +/* jl_task_sync() -- get the return value of task `t` + + Returns only when task `t` has completed. + */ +JL_DLLEXPORT jl_value_t *jl_task_sync(jl_task_t *task) +{ + jl_ptls_t ptls = jl_get_ptls_states(); + + if (task == ptls->current_task) + jl_error("cannot sync on self"); + + /* if the target task has not finished, add the current task to its + completion queue; the thread that runs the target task will add + this task back to the ready queue + */ + if (task->state != done_sym && task->state != failed_sym) { + // TODO: problem if a grain task does a sync? + ptls->current_task->next = NULL; + JL_LOCK(&task->cq.lock); + + /* ensure the task didn't finish before we got the lock */ + if (task->state != done_sym && task->state != failed_sym) { + /* add the current task to the CQ */ + if (!task->cq.head) { + task->cq.head = ptls->current_task; + jl_gc_wb(task, task->cq.head); + } + else { + jl_task_t *pt = task->cq.head; + while (pt->next) + pt = pt->next; + pt->next = ptls->current_task; + jl_gc_wb(pt, pt->next); + } + + JL_UNLOCK(&task->cq.lock); + JL_TRY { + jl_task_yield(0); + } + JL_CATCH { + taskq_delete(&task->cq.head, ptls->current_task); + jl_rethrow(); + } + } + + /* the task finished before we could add to its CQ */ + else + JL_UNLOCK(&task->cq.lock); + } + + if (task->state == failed_sym) + jl_throw(task->exception); + + return task->grain_num >= 0 && task->red ? task->redresult : task->result; +} + + +/* jl_task_yield() -- cause the invoking task to yield + + If `requeue` is set, the task is inserted into the relevant queue + (sticky or multiqueue), otherwise it is assumed it will be re-queued + in some other way (e.g. from another task's completion queue). 
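+
+   For example, jl_task_wait() below parks the current task on a
+   condition's queue and then calls jl_task_yield(0); the matching
+   jl_task_notify() re-enqueues the task, so the yield itself must not.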
+ */ +JL_DLLEXPORT jl_value_t *jl_task_yield(int requeue) +{ + jl_ptls_t ptls = jl_get_ptls_states(); + + if (ptls->in_finalizer) + jl_error("task switch not allowed from inside gc finalizer"); + if (ptls->in_pure_callback) + jl_error("task switch not allowed from inside staged nor pure functions"); + + if (requeue) + enqueue_task(ptls->current_task); + + // run the next available task + run_next(); + + // yielding task (eventually) continues + jl_value_t *exc = ptls->current_task->exception; + if (exc != jl_nothing) { + ptls->current_task->exception = jl_nothing; + jl_throw(exc); + } + + jl_value_t *res = ptls->current_task->result; + ptls->current_task->result = jl_nothing; + return res; +} + + +/* jl_condition_new() -- create a new Condition + */ +JL_DLLEXPORT jl_condition_t *jl_condition_new(void) +{ + jl_condition_t *cond = (jl_condition_t *) + jl_new_struct_uninit(jl_condition_type); + cond->head = NULL; + JL_GC_PUSH1(&cond); + JL_MUTEX_INIT(&cond->lock); + JL_GC_POP(); + + return cond; +} + + +/* jl_task_wait() -- deschedules the task until the specified condition is + triggered + */ +JL_DLLEXPORT jl_value_t *jl_task_wait(jl_condition_t *c) +{ + jl_ptls_t ptls = jl_get_ptls_states(); + JL_LOCK(&c->lock); + if (!c->head) { + c->head = ptls->current_task; + jl_gc_wb(c, c->head); + } + else { + jl_task_t *pt = c->head; + while (pt->next) + pt = pt->next; + pt->next = ptls->current_task; + jl_gc_wb(pt, pt->next); + } + JL_UNLOCK(&c->lock); + jl_value_t *val = NULL; + JL_TRY { + val = jl_task_yield(0); + } + JL_CATCH { + taskq_delete(&c->head, ptls->current_task); + jl_rethrow(); + } + return val; +} + + +/* jl_task_notify() -- triggers the specified condition, causing all tasks + waiting on it to become schedulable + */ +JL_DLLEXPORT void jl_task_notify(jl_condition_t *c, jl_value_t *arg, int8_t all, int8_t err) +{ + JL_LOCK(&c->lock); + jl_task_t *qtask = c->head; + if (all) + c->head = NULL; + else { + if (c->head) { + c->head = c->head->next; + qtask->next = NULL; + } + } + JL_UNLOCK(&c->lock); + + jl_task_t *qnext; + while (qtask) { + qnext = qtask->next; + qtask->next = NULL; + if (err) { + qtask->exception = arg; + jl_gc_wb(qtask, qtask->exception); + } + else { + qtask->result = arg; + jl_gc_wb(qtask, qtask->result); + } + enqueue_task(qtask); + qtask = qnext; + } +} + + +JL_DLLEXPORT int jl_condition_isempty(jl_condition_t *c) +{ + return c->head ? 0 : 1; +} + + +void jl_gc_mark_enqueued_tasks(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp) +{ + for (int16_t i = 0; i < heap_p; ++i) + for (int16_t j = 0; j < heaps[i].ntasks; ++j) + jl_gc_mark_queue_obj_explicit(gc_cache, sp, (jl_value_t *)heaps[i].tasks[j]); + for (int16_t i = 0; i < jl_n_threads; ++i) { + jl_task_t *t = sticky_taskqs[i].head; + while (t) { + jl_gc_mark_queue_obj_explicit(gc_cache, sp, (jl_value_t *)t); + t = t->next; + } + } +} + +#endif // JULIA_ENABLE_PARTR +#endif // JULIA_ENABLE_THREADING + +#ifdef __cplusplus +} +#endif diff --git a/src/partr.h b/src/partr.h new file mode 100644 index 0000000000000..a8c65da362a82 --- /dev/null +++ b/src/partr.h @@ -0,0 +1,46 @@ +// This file is a part of Julia. 
License is MIT: https://julialang.org/license + +/* partr -- parallel tasks runtime options + */ + +#ifndef PARTR_H +#define PARTR_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef JULIA_ENABLE_PARTR + +#include "julia.h" + + +/* multiq */ +#define MULTIQ_HEAP_C 4 + /* number of heaps = MULTIQ_HEAP_C * nthreads */ +#define MULTIQ_TASKS_PER_HEAP 129 + /* how many in each heap */ + +/* parfor */ +#define GRAIN_K 4 + /* tasks = niters / (GRAIN_K * nthreads) */ + +/* synchronization */ +#define ARRIVERS_P 2 + /* narrivers = ((GRAIN_K * nthreads) ^ ARRIVERS_P) + 1 + limit for number of recursive parfors */ +#define REDUCERS_FRAC 1 + /* nreducers = narrivers * REDUCERS_FRAC */ + + +#endif /* JULIA_ENABLE_PARTR */ + +#ifdef __cplusplus +} +#endif + +#endif /* PARTR_H */ + diff --git a/src/staticdata.c b/src/staticdata.c index 921abb4b770be..3bbc6af7441a9 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -1664,6 +1664,9 @@ static void jl_init_serializer2(int for_serialize) jl_globalref_type->name, jl_typeofbottom_type->name, jl_string_type->name, jl_abstractstring_type->name, jl_namedtuple_type, jl_namedtuple_typename, +#ifdef JULIA_ENABLE_PARTR + jl_condition_type, jl_condition_type->name, +#endif jl_int32_type, jl_int64_type, jl_bool_type, jl_uint8_type, jl_uint32_type, jl_uint64_type, diff --git a/src/task.c b/src/task.c index 3dce377a01a89..71d20805219d8 100644 --- a/src/task.c +++ b/src/task.c @@ -59,16 +59,22 @@ volatile int jl_in_stackwalk = 0; #define ROOT_TASK_STACK_ADJUSTMENT 3000000 -static jl_sym_t *done_sym; -static jl_sym_t *failed_sym; -static jl_sym_t *runnable_sym; +jl_sym_t *done_sym; +jl_sym_t *failed_sym; +jl_sym_t *runnable_sym; extern size_t jl_page_size; jl_datatype_t *jl_task_type; -static char *jl_alloc_fiber(jl_ucontext_t *t, size_t *ssize, jl_task_t *owner); -static void jl_set_fiber(jl_ucontext_t *t); -static void jl_start_fiber(jl_ucontext_t *lastt, jl_ucontext_t *t); -static void jl_swap_fiber(jl_ucontext_t *lastt, jl_ucontext_t *t); +#ifdef JULIA_ENABLE_PARTR +jl_datatype_t *jl_condition_type; + +void NOINLINE JL_NORETURN start_task(void); +#endif + +char *jl_alloc_fiber(jl_ucontext_t *t, size_t *ssize, jl_task_t *owner); +void jl_set_fiber(jl_ucontext_t *t); +void jl_start_fiber(jl_ucontext_t *lastt, jl_ucontext_t *t); +void jl_swap_fiber(jl_ucontext_t *lastt, jl_ucontext_t *t); #ifdef JL_HAVE_UNW_CONTEXT static JL_THREAD unw_cursor_t jl_basecursor; @@ -84,7 +90,7 @@ static void memcpy_a16(uint64_t *to, uint64_t *from, size_t nb) // *(to++) = *(from++); } -static void NOINLINE save_stack(jl_ptls_t ptls, jl_task_t *lastt, jl_task_t **pt) +void NOINLINE save_stack(jl_ptls_t ptls, jl_task_t *lastt, jl_task_t **pt) { char *frame_addr = (char*)((uintptr_t)jl_get_frame_addr() & ~15); char *stackbase = (char*)ptls->stackbase; @@ -108,7 +114,7 @@ static void NOINLINE save_stack(jl_ptls_t ptls, jl_task_t *lastt, jl_task_t **pt jl_gc_wb_back(lastt); } -static void NOINLINE JL_NORETURN restore_stack(jl_ptls_t ptls, char *p) +void NOINLINE JL_NORETURN restore_stack(jl_ptls_t ptls, char *p) { jl_task_t *t = ptls->current_task; size_t nb = t->copy_stack; @@ -126,7 +132,8 @@ static void NOINLINE JL_NORETURN restore_stack(jl_ptls_t ptls, char *p) jl_set_fiber(&t->ctx); abort(); // unreachable } -static void restore_stack2(jl_ptls_t ptls, jl_task_t *lastt) + +void restore_stack2(jl_ptls_t ptls, jl_task_t *lastt) { jl_task_t *t = ptls->current_task; size_t nb = t->copy_stack; @@ -137,8 +144,9 @@ static void restore_stack2(jl_ptls_t ptls, jl_task_t 
*lastt) } #endif -static jl_function_t *task_done_hook_func = NULL; +jl_function_t *task_done_hook_func=NULL; +#ifndef JULIA_ENABLE_PARTR static void JL_NORETURN finish_task(jl_task_t *t, jl_value_t *resultval JL_MAYBE_UNROOTED) { jl_ptls_t ptls = jl_get_ptls_states(); @@ -180,6 +188,7 @@ static void JL_NORETURN finish_task(jl_task_t *t, jl_value_t *resultval JL_MAYBE gc_debug_critical_error(); abort(); } +#endif // JULIA_ENABLE_PARTR JL_DLLEXPORT void *jl_task_stack_buffer(jl_task_t *task, size_t *size, int *tid) { @@ -296,6 +305,11 @@ static void ctx_switch(jl_ptls_t ptls, jl_task_t **pt) ptls->world_age = t->world_age; t->gcstack = NULL; ptls->current_task = t; +#ifdef JULIA_ENABLE_PARTR + if (!lastt->copy_stack) + lastt->current_tid = -1; + t->current_tid = ptls->tid; +#endif jl_ucontext_t *lastt_ctx = (killed ? NULL : &lastt->ctx); #ifdef COPY_STACKS @@ -338,6 +352,8 @@ JL_DLLEXPORT void jl_switchto(jl_task_t **pt) jl_ptls_t ptls = jl_get_ptls_states(); jl_task_t *t = *pt; if (t == ptls->current_task) { + if (t->state != runnable_sym) + jl_error("trying to switch to done task from itself"); return; } if (t->state == done_sym || t->state == failed_sym || @@ -360,6 +376,8 @@ JL_DLLEXPORT void jl_switchto(jl_task_t **pt) jl_sigint_safepoint(ptls); } +jl_timing_block_t *jl_pop_timing_block(jl_timing_block_t *cur_block); + JL_DLLEXPORT JL_NORETURN void jl_no_exc_handler(jl_value_t *e) JL_NOTSAFEPOINT { jl_printf(JL_STDERR, "fatal: error thrown and no exception handler available.\n"); @@ -452,6 +470,7 @@ JL_DLLEXPORT void jl_rethrow_other(jl_value_t *e JL_MAYBE_UNROOTED) throw_internal(NULL); } +#ifndef JULIA_ENABLE_PARTR JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, size_t ssize) { jl_ptls_t ptls = jl_get_ptls_states(); @@ -475,9 +494,9 @@ JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, size_t ssize) if (t->stkbuf == NULL) jl_throw(jl_memory_exception); } - t->tls = jl_nothing; + t->storage = jl_nothing; t->state = runnable_sym; - t->start = start; + t->taskentry = start; t->result = jl_nothing; t->donenotify = jl_nothing; t->exception = jl_nothing; @@ -486,11 +505,10 @@ JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, size_t ssize) t->logstate = ptls->current_task->logstate; // there is no active exception handler available on this stack yet t->eh = NULL; - t->tid = 0; + t->current_tid = 0; t->gcstack = NULL; t->excstack = NULL; t->stkbuf = NULL; - t->tid = 0; t->started = 0; #ifdef ENABLE_TIMINGS t->timing_stack = NULL; @@ -507,8 +525,10 @@ JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, size_t ssize) if (t->copy_stack) memcpy(&t->ctx, &ptls->base_ctx, sizeof(t->ctx)); #endif + return t; } +#endif // !JULIA_ENABLE_PARTR JL_DLLEXPORT jl_value_t *jl_get_current_task(void) { @@ -519,6 +539,7 @@ JL_DLLEXPORT jl_value_t *jl_get_current_task(void) // Do one-time initializations for task system void jl_init_tasks(void) JL_GC_DISABLED { +#ifndef JULIA_ENABLE_PARTR jl_task_type = (jl_datatype_t*) jl_new_datatype(jl_symbol("Task"), NULL, @@ -543,12 +564,57 @@ void jl_init_tasks(void) JL_GC_DISABLED jl_any_type, jl_any_type), 0, 1, 7); +#else /* JULIA_ENABLE_PARTR */ + jl_task_type = (jl_datatype_t*) + jl_new_datatype(jl_symbol("Task"), NULL, jl_any_type, jl_emptysvec, + jl_perm_symsvec(14, + "storage", + "state", + "result", + "exception", + "backtrace", + "logstate", + "code", + "redentry", + "cq_head", + "cq_lock_owner", + "cq_lock_count", + "next", + "parent", + "redresult"), + jl_svec(14, + jl_any_type, + jl_sym_type, + jl_any_type, + jl_any_type, + 
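+                            /* the next five Any entries are backtrace,
+                               logstate, code, redentry and cq_head;
+                               cq_head, next and parent are re-set to
+                               Task via jl_svecset below */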
jl_any_type, + jl_any_type, + jl_any_type, + jl_any_type, + jl_any_type, + jl_long_type, + jl_int32_type, + jl_any_type, + jl_any_type, + jl_any_type), + 0, 1, 6); + jl_svecset(jl_task_type->types, 8, (jl_value_t*)jl_task_type); + jl_svecset(jl_task_type->types, 11, (jl_value_t*)jl_task_type); + jl_svecset(jl_task_type->types, 12, (jl_value_t*)jl_task_type); + jl_condition_type = (jl_datatype_t*) + jl_new_datatype(jl_symbol("Condition"), NULL, jl_any_type, jl_emptysvec, + jl_perm_symsvec(3, "head", "lock_owner", "lock_count"), + jl_svec(3, jl_task_type, jl_long_type, jl_int32_type), + 0, 1, 0); +#endif /* JULIA_ENABLE_PARTR */ + done_sym = jl_symbol("done"); failed_sym = jl_symbol("failed"); runnable_sym = jl_symbol("runnable"); } -static void NOINLINE JL_NORETURN start_task(void) +#ifndef JULIA_ENABLE_PARTR +void NOINLINE JL_NORETURN start_task(void) { // this runs the first time we switch to a task jl_ptls_t ptls = jl_get_ptls_states(); @@ -558,7 +624,7 @@ static void NOINLINE JL_NORETURN start_task(void) if (t->exception != jl_nothing) { record_backtrace(ptls); jl_push_excstack(&t->excstack, t->exception, - ptls->bt_data, ptls->bt_size); + ptls->bt_data, ptls->bt_size); res = t->exception; } else { @@ -569,7 +635,7 @@ static void NOINLINE JL_NORETURN start_task(void) } JL_TIMING(ROOT); ptls->world_age = jl_world_counter; - res = jl_apply(&t->start, 1); + res = jl_apply(&t->taskentry, 1); } JL_CATCH { res = jl_current_exception(); @@ -583,7 +649,7 @@ skip_pop_exception:; gc_debug_critical_error(); abort(); } - +#endif /* JULIA_ENABLE_PARTR */ #if defined(JL_HAVE_UCONTEXT) #ifdef _OS_WINDOWS_ @@ -592,7 +658,7 @@ skip_pop_exception:; #define swapcontext jl_swapcontext #define makecontext jl_makecontext #endif -static char *jl_alloc_fiber(jl_ucontext_t *t, size_t *ssize, jl_task_t *owner) +char *jl_alloc_fiber(jl_ucontext_t *t, size_t *ssize, jl_task_t *owner) { #ifndef _OS_WINDOWS_ int r = getcontext(t); @@ -612,22 +678,22 @@ static char *jl_alloc_fiber(jl_ucontext_t *t, size_t *ssize, jl_task_t *owner) #endif return (char*)stk; } -static void jl_start_fiber(jl_ucontext_t *lastt, jl_ucontext_t *t) +void jl_start_fiber(jl_ucontext_t *lastt, jl_ucontext_t *t) { if (lastt) swapcontext(lastt, t); else setcontext(t); } -static void jl_swap_fiber(jl_ucontext_t *lastt, jl_ucontext_t *t) +void jl_swap_fiber(jl_ucontext_t *lastt, jl_ucontext_t *t) { swapcontext(lastt, t); } -static void jl_set_fiber(jl_ucontext_t *t) +void jl_set_fiber(jl_ucontext_t *t) { setcontext(t); } -static void jl_init_basefiber(size_t ssize) +void jl_init_basefiber(size_t ssize) { jl_ptls_t ptls = jl_get_ptls_states(); char *stkbuf = jl_alloc_fiber(&ptls->base_ctx, &ssize, NULL); @@ -637,7 +703,7 @@ static void jl_init_basefiber(size_t ssize) #endif #if defined(JL_HAVE_UNW_CONTEXT) -static void start_basefiber(void) +void start_basefiber(void) { jl_ptls_t ptls = jl_get_ptls_states(); if (jl_setjmp(ptls->base_ctx.uc_mcontext, 0)) @@ -656,7 +722,7 @@ static void start_basefiber(void) #else #error please define how to simulate a CALL on this platform #endif -static char *jl_alloc_fiber(unw_context_t *t, size_t *ssize, jl_task_t *owner) +char *jl_alloc_fiber(unw_context_t *t, size_t *ssize, jl_task_t *owner) { char *stkbuf = (char*)jl_malloc_stack(ssize, owner); if (stkbuf == NULL) @@ -679,23 +745,23 @@ static char *jl_alloc_fiber(unw_context_t *t, size_t *ssize, jl_task_t *owner) } return stkbuf; } -static void jl_start_fiber(unw_context_t *lastt, unw_context_t *t) +void jl_start_fiber(unw_context_t *lastt, unw_context_t *t) { 
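+    /* save the outgoing context (if any), then jump through
+       jl_basecursor into the new fiber (does not return) */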
if (lastt && jl_setjmp(lastt->uc_mcontext, 0)) return; unw_resume(&jl_basecursor); // (doesn't return) } -static void jl_swap_fiber(unw_context_t *lastt, unw_context_t *t) +void jl_swap_fiber(unw_context_t *lastt, unw_context_t *t) { if (jl_setjmp(lastt->uc_mcontext, 0)) return; jl_longjmp(t->uc_mcontext, 1); // (doesn't return) } -static void jl_set_fiber(unw_context_t *t) +void jl_set_fiber(unw_context_t *t) { jl_longjmp(t->uc_mcontext, 1); } -static void jl_init_basefiber(size_t ssize) +void jl_init_basefiber(size_t ssize) { int r = unw_getcontext(&ptls->base_ctx); if (r != 0) @@ -714,7 +780,7 @@ static void jl_init_basefiber(size_t ssize) #endif #if defined(JL_HAVE_ASM) -static char *jl_alloc_fiber(jl_ucontext_t *t, size_t *ssize, jl_task_t *owner) +char *jl_alloc_fiber(jl_ucontext_t *t, size_t *ssize, jl_task_t *owner) { char *stkbuf = (char*)jl_malloc_stack(ssize, owner); if (stkbuf == NULL) @@ -723,7 +789,7 @@ static char *jl_alloc_fiber(jl_ucontext_t *t, size_t *ssize, jl_task_t *owner) ((size_t*)t)[1] = *ssize; // stash the stack size somewhere for start_fiber return stkbuf; } -static void jl_start_fiber(jl_ucontext_t *lastt, jl_ucontext_t *t) +void jl_start_fiber(jl_ucontext_t *lastt, jl_ucontext_t *t) { if (lastt && jl_setjmp(lastt->uc_mcontext, 0)) return; @@ -770,17 +836,17 @@ static void jl_start_fiber(jl_ucontext_t *lastt, jl_ucontext_t *t) #endif __builtin_unreachable(); } -static void jl_swap_fiber(jl_ucontext_t *lastt, jl_ucontext_t *t) +void jl_swap_fiber(jl_ucontext_t *lastt, jl_ucontext_t *t) { if (jl_setjmp(lastt->uc_mcontext, 0)) return; jl_longjmp(t->uc_mcontext, 1); // (doesn't return) } -static void jl_set_fiber(jl_ucontext_t *t) +void jl_set_fiber(jl_ucontext_t *t) { jl_longjmp(t->uc_mcontext, 1); } -static void jl_init_basefiber(size_t ssize) +void jl_init_basefiber(size_t ssize) { #ifdef COPY_STACKS jl_ptls_t ptls = jl_get_ptls_states(); @@ -792,13 +858,13 @@ static void jl_init_basefiber(size_t ssize) #endif #if defined(JL_HAVE_SIGALTSTACK) -static void start_basefiber(void) +void start_basefiber(void) { jl_ptls_t ptls = jl_get_ptls_states(); if (jl_setjmp(ptls->base_ctx.uc_mcontext, 0)) start_task(); } -static char *jl_alloc_fiber(jl_ucontext_t *t, size_t *ssize, jl_task_t *owner) +char *jl_alloc_fiber(jl_ucontext_t *t, size_t *ssize, jl_task_t *owner) { stack_t uc_stack, osigstk; struct sigaction sa, osa; @@ -852,23 +918,23 @@ static char *jl_alloc_fiber(jl_ucontext_t *t, size_t *ssize, jl_task_t *owner) memcpy(&ptls->base_ctx, &base_ctx, sizeof(ptls->base_ctx)); return (char*)stk; } -static void jl_start_fiber(jl_ucontext_t *lastt, jl_ucontext_t *t) +void jl_start_fiber(jl_ucontext_t *lastt, jl_ucontext_t *t) { if (lastt && jl_setjmp(lastt->uc_mcontext, 0)) return; jl_longjmp(t->uc_mcontext, 1); // (doesn't return) } -static void jl_swap_fiber(jl_ucontext_t *lastt, jl_ucontext_t *t) +void jl_swap_fiber(jl_ucontext_t *lastt, jl_ucontext_t *t) { if (jl_setjmp(lastt->uc_mcontext, 0)) return; jl_longjmp(t->uc_mcontext, 1); // (doesn't return) } -static void jl_set_fiber(jl_ucontext_t *t) +void jl_set_fiber(jl_ucontext_t *t) { jl_longjmp(t->uc_mcontext, 1); } -static void jl_init_basefiber(size_t ssize) +void jl_init_basefiber(size_t ssize) { #ifdef COPY_STACKS jl_ptls_t ptls = jl_get_ptls_states(); @@ -898,18 +964,31 @@ void jl_init_root_task(void *stack_lo, void *stack_hi) ptls->current_task->stkbuf = stack; ptls->current_task->bufsz = ssize; ptls->current_task->started = 1; - ptls->current_task->tls = jl_nothing; +#ifdef JULIA_ENABLE_PARTR + 
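+    /* partr-specific bookkeeping for the root task */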
ptls->current_task->redentry = NULL; + ptls->current_task->cq.head = NULL; + JL_MUTEX_INIT(&ptls->current_task->cq.lock); + ptls->current_task->next = NULL; + ptls->current_task->parent = ptls->current_task; + ptls->current_task->redresult = jl_nothing; + ptls->current_task->arr = NULL; + ptls->current_task->red = NULL; + ptls->current_task->sticky_tid = -1; + ptls->current_task->grain_num = -1; +#else + ptls->current_task->donenotify = jl_nothing; +#endif + ptls->current_task->current_tid = ptls->tid; + ptls->current_task->storage = jl_nothing; + ptls->current_task->taskentry = NULL; ptls->current_task->state = runnable_sym; - ptls->current_task->start = NULL; ptls->current_task->result = jl_nothing; - ptls->current_task->donenotify = jl_nothing; ptls->current_task->exception = jl_nothing; ptls->current_task->backtrace = jl_nothing; ptls->current_task->logstate = jl_nothing; ptls->current_task->eh = NULL; ptls->current_task->gcstack = NULL; ptls->current_task->excstack = NULL; - ptls->current_task->tid = ptls->tid; #ifdef JULIA_ENABLE_THREADING arraylist_new(&ptls->current_task->locks, 0); #endif diff --git a/src/threadgroup.c b/src/threadgroup.c deleted file mode 100644 index f2158423acc0e..0000000000000 --- a/src/threadgroup.c +++ /dev/null @@ -1,206 +0,0 @@ -// This file is a part of Julia. License is MIT: https://julialang.org/license - -/* - threading infrastructure - . threadgroup abstraction - . fork/join/barrier -*/ - -#include -#include - -#include "julia.h" -#include "julia_internal.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#include "options.h" -#include "threadgroup.h" - -int ti_threadgroup_create(uint8_t num_sockets, uint8_t num_cores, - uint8_t num_threads_per_core, - ti_threadgroup_t **newtg) -{ - int i; - ti_threadgroup_t *tg; - int num_threads = num_sockets * num_cores * num_threads_per_core; - char *cp; - - tg = (ti_threadgroup_t*)jl_malloc_aligned(sizeof(ti_threadgroup_t), 64); - tg->tid_map = (int16_t*)jl_malloc_aligned(num_threads * sizeof(int16_t), 64); - for (i = 0; i < num_threads; ++i) - tg->tid_map[i] = -1; - tg->num_sockets = num_sockets; - tg->num_cores = num_cores; - tg->num_threads_per_core = num_threads_per_core; - tg->num_threads = num_threads; - tg->added_threads = 0; - tg->thread_sense = (ti_thread_sense_t**) - jl_malloc_aligned(num_threads * sizeof(ti_thread_sense_t*), 64); - for (i = 0; i < num_threads; i++) - tg->thread_sense[i] = NULL; - jl_atomic_store_release(&tg->group_sense, 0); - - uv_mutex_init(&tg->alarm_lock); - uv_cond_init(&tg->alarm); - - tg->sleep_threshold = DEFAULT_THREAD_SLEEP_THRESHOLD; - cp = getenv(THREAD_SLEEP_THRESHOLD_NAME); - if (cp) { - if (!strncasecmp(cp, "infinite", 8)) - tg->sleep_threshold = 0; - else - tg->sleep_threshold = (uint64_t)strtol(cp, NULL, 10); - } - - *newtg = tg; - return 0; -} - -int ti_threadgroup_addthread(ti_threadgroup_t *tg, int16_t ext_tid, - int16_t *tgtid) -{ - if (ext_tid < 0 || ext_tid >= tg->num_threads) - return -1; - if (tg->tid_map[ext_tid] != -1) - return -2; - if (tg->added_threads == tg->num_threads) - return -3; - - tg->tid_map[ext_tid] = tg->added_threads++; - if (tgtid) *tgtid = tg->tid_map[ext_tid]; - - return 0; -} - -int ti_threadgroup_initthread(ti_threadgroup_t *tg, int16_t ext_tid) -{ - ti_thread_sense_t *ts; - - if (ext_tid < 0 || ext_tid >= tg->num_threads) - return -1; - if (tg->thread_sense[tg->tid_map[ext_tid]] != NULL) - return -2; - if (tg->num_threads == 0) - return -3; - - ts = (ti_thread_sense_t*)jl_malloc_aligned(sizeof(ti_thread_sense_t), 64); - ts->sense = 1; - 
tg->thread_sense[tg->tid_map[ext_tid]] = ts; - - return 0; -} - -int ti_threadgroup_member(ti_threadgroup_t *tg, int16_t ext_tid, int16_t *tgtid) -{ - if (ext_tid < 0 || ext_tid >= tg->num_threads) - return -1; - if (tg == NULL) { - if (tgtid) *tgtid = -1; - return -2; - } - if (tg->tid_map[ext_tid] == -1) { - if (tgtid) *tgtid = -1; - return -3; - } - if (tgtid) *tgtid = tg->tid_map[ext_tid]; - - return 0; -} - -int ti_threadgroup_size(ti_threadgroup_t *tg, int16_t *tgsize) -{ - *tgsize = tg->num_threads; - return 0; -} - -int ti_threadgroup_fork(ti_threadgroup_t *tg, int16_t ext_tid, void **bcast_val, int init) -{ - uint8_t *group_sense = &tg->group_sense; - int16_t tid = tg->tid_map[ext_tid]; - int thread_sense = tg->thread_sense[tid]->sense; - if (tid == 0) { - tg->envelope = bcast_val ? *bcast_val : NULL; - // synchronize `tg->envelope` and `tg->group_sense` - jl_atomic_store_release(group_sense, thread_sense); - - // if it's possible that threads are sleeping, signal them - if (tg->sleep_threshold) { - uv_mutex_lock(&tg->alarm_lock); - uv_cond_broadcast(&tg->alarm); - uv_mutex_unlock(&tg->alarm_lock); - } - } - else { - // spin up to threshold ns (count sheep), then sleep - uint64_t spin_ns; - uint64_t spin_start = 0; - // synchronize `tg->envelope` and `tg->group_sense` - while (jl_atomic_load_acquire(group_sense) != thread_sense) { - if (tg->sleep_threshold) { - if (!spin_start) { - // Lazily initialize spin_start since uv_hrtime is expensive - spin_start = uv_hrtime(); - continue; - } - spin_ns = uv_hrtime() - spin_start; - // In case uv_hrtime is not monotonic, we'll sleep earlier - if (init || spin_ns >= tg->sleep_threshold) { - uv_mutex_lock(&tg->alarm_lock); - if (jl_atomic_load_acquire(group_sense) != thread_sense) { - uv_cond_wait(&tg->alarm, &tg->alarm_lock); - } - uv_mutex_unlock(&tg->alarm_lock); - spin_start = 0; - init = 0; - continue; - } - } - jl_cpu_pause(); - } - if (bcast_val) - *bcast_val = tg->envelope; - } - - return 0; -} - -int ti_threadgroup_join(ti_threadgroup_t *tg, int16_t ext_tid) -{ - int *p_thread_sense = &tg->thread_sense[tg->tid_map[ext_tid]]->sense; - jl_atomic_store_release(p_thread_sense, !*p_thread_sense); - if (tg->tid_map[ext_tid] == 0) { - jl_ptls_t ptls = jl_get_ptls_states(); - int8_t group_sense = tg->group_sense; - for (int i = 1; i < tg->num_threads; ++i) { - while (jl_atomic_load_acquire(&tg->thread_sense[i]->sense) == group_sense) { - jl_gc_safepoint_(ptls); - jl_cpu_pause(); - } - } - } - - return 0; -} - -int ti_threadgroup_destroy(ti_threadgroup_t *tg) -{ - int i; - - uv_mutex_destroy(&tg->alarm_lock); - uv_cond_destroy(&tg->alarm); - - for (i = 0; i < tg->num_threads; i++) - jl_free_aligned(tg->thread_sense[i]); - jl_free_aligned(tg->thread_sense); - jl_free_aligned(tg->tid_map); - jl_free_aligned(tg); - - return 0; -} - -#ifdef __cplusplus -} -#endif diff --git a/src/threadgroup.h b/src/threadgroup.h deleted file mode 100644 index 82fc59785cd05..0000000000000 --- a/src/threadgroup.h +++ /dev/null @@ -1,44 +0,0 @@ -// This file is a part of Julia. 
License is MIT: https://julialang.org/license - -#ifndef JL_THREADGROUP_H -#define JL_THREADGROUP_H - -#include -#include "uv.h" - -// for the barrier -typedef struct { - int sense; -} ti_thread_sense_t; - -// thread group -typedef struct { - int16_t *tid_map, num_threads, added_threads; - uint8_t num_sockets, num_cores, num_threads_per_core; - - // fork/join/barrier - uint8_t group_sense; // Written only by master thread - ti_thread_sense_t **thread_sense; - void *envelope; - - // to let threads sleep - uv_mutex_t alarm_lock; - uv_cond_t alarm; - uint64_t sleep_threshold; -} ti_threadgroup_t; - -int ti_threadgroup_create(uint8_t num_sockets, uint8_t num_cores, - uint8_t num_threads_per_core, - ti_threadgroup_t **newtg); -int ti_threadgroup_addthread(ti_threadgroup_t *tg, int16_t ext_tid, - int16_t *tgtid); -int ti_threadgroup_initthread(ti_threadgroup_t *tg, int16_t ext_tid); -int ti_threadgroup_member(ti_threadgroup_t *tg, int16_t ext_tid, - int16_t *tgtid); -int ti_threadgroup_size(ti_threadgroup_t *tg, int16_t *tgsize); -int ti_threadgroup_fork(ti_threadgroup_t *tg, int16_t ext_tid, - void **bcast_val, int init); -int ti_threadgroup_join(ti_threadgroup_t *tg, int16_t ext_tid); -int ti_threadgroup_destroy(ti_threadgroup_t *tg); - -#endif /* THREADGROUP_H */ diff --git a/src/threading.c b/src/threading.c index 92c0eac214bc4..e13ee570ad312 100644 --- a/src/threading.c +++ b/src/threading.c @@ -1,18 +1,5 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license -/* - threading infrastructure - . thread and threadgroup creation - . thread function - . invoke Julia function from multiple threads - -TODO: - . fix interface to properly support thread groups - . add queue per thread for tasks - . add reduction; reduce values returned from thread function - . 
make code generation thread-safe and remove the lock -*/ - #include #include #include @@ -47,7 +34,6 @@ extern "C" { #endif -#include "threadgroup.h" #include "threading.h" // The tls_states buffer: @@ -240,9 +226,9 @@ JL_DLLEXPORT JL_CONST_FUNC jl_ptls_t (jl_get_ptls_states)(void) } #endif -// thread ID -JL_DLLEXPORT int jl_n_threads; // # threads we're actually using +JL_DLLEXPORT int jl_n_threads; jl_ptls_t *jl_all_tls_states; +uint64_t jl_thread_sleep_threshold; // return calling thread's ID // Also update the suspended_threads list in signals-mach when changing the @@ -253,10 +239,19 @@ JL_DLLEXPORT int16_t jl_threadid(void) return ptls->tid; } -static void ti_initthread(int16_t tid) +void jl_init_threadtls(int16_t tid) { jl_ptls_t ptls = jl_get_ptls_states(); -#ifndef _OS_WINDOWS_ +#ifdef _OS_WINDOWS_ + if (tid == 0) { + if (!DuplicateHandle(GetCurrentProcess(), GetCurrentThread(), + GetCurrentProcess(), &hMainThread, 0, + FALSE, DUPLICATE_SAME_ACCESS)) { + jl_printf(JL_STDERR, "WARNING: failed to access handle to main thread\n"); + hMainThread = INVALID_HANDLE_VALUE; + } + } +#else ptls->system_id = pthread_self(); #endif assert(ptls->world_age == 0); @@ -293,24 +288,12 @@ static void ti_initthread(int16_t tid) jl_all_tls_states[tid] = ptls; } -static void ti_init_master_thread(void) -{ -#ifdef _OS_WINDOWS_ - if (!DuplicateHandle(GetCurrentProcess(), GetCurrentThread(), - GetCurrentProcess(), &hMainThread, 0, - FALSE, DUPLICATE_SAME_ACCESS)) { - jl_printf(JL_STDERR, "WARNING: failed to access handle to main thread\n"); - hMainThread = INVALID_HANDLE_VALUE; - } -#endif - ti_initthread(0); -} - // all threads call this function to run user code -static jl_value_t *ti_run_fun(jl_callptr_t fptr, jl_method_instance_t *mfunc, +jl_value_t *jl_thread_run_fun(jl_callptr_t fptr, jl_method_instance_t *mfunc, jl_value_t **args, uint32_t nargs) { jl_ptls_t ptls = jl_get_ptls_states(); + jl_value_t *res = jl_nothing; JL_TRY { fptr(mfunc, args, nargs); } @@ -323,129 +306,22 @@ static jl_value_t *ti_run_fun(jl_callptr_t fptr, jl_method_instance_t *mfunc, if (!jl_setjmp(buf, 0)) { // Set up the safe_restore context so that the printing uses the thread safe version ptls->safe_restore = &buf; - jl_printf(JL_STDERR, "\nError thrown in threaded loop on thread %d: ", + jl_printf(JL_STDERR, "\nError thrown in thread %d: ", (int)ptls->tid); jl_static_show(JL_STDERR, jl_current_exception()); } ptls->safe_restore = old_buf; JL_UNLOCK_NOGC(&lock); } - return jl_nothing; + return res; } - // lock for code generation jl_mutex_t codegen_lock; jl_mutex_t typecache_lock; #ifdef JULIA_ENABLE_THREADING -// only one thread group for now -static ti_threadgroup_t *tgworld; - -// for broadcasting work to threads -static ti_threadwork_t threadwork; - -#if PROFILE_JL_THREADING -uint64_t prep_ns; -uint64_t *fork_ns; -uint64_t *user_ns; -uint64_t *join_ns; -#endif - -static uv_barrier_t thread_init_done; - -// thread function: used by all except the main thread -void ti_threadfun(void *arg) -{ - jl_ptls_t ptls = jl_get_ptls_states(); - ti_threadarg_t *ta = (ti_threadarg_t *)arg; - ti_threadgroup_t *tg; - ti_threadwork_t *work; - - // initialize this thread (set tid, create heap, etc.) 
- ti_initthread(ta->tid); - void *stack_lo, *stack_hi; - jl_init_stack_limits(0, &stack_lo, &stack_hi); - - // set up tasking - jl_init_root_task(stack_lo, stack_hi); - - // set the thread-local tid and wait for a thread group - while (jl_atomic_load_acquire(&ta->state) == TI_THREAD_INIT) - jl_cpu_pause(); - - // Assuming the functions called below doesn't contain unprotected GC - // critical region. In general, the following part of this function - // shouldn't call any managed code without calling `jl_gc_unsafe_enter` - // first. - jl_gc_state_set(ptls, JL_GC_STATE_SAFE, 0); - uv_barrier_wait(&thread_init_done); - // initialize this thread in the thread group - tg = ta->tg; - ti_threadgroup_initthread(tg, ptls->tid); - - // free the thread argument here - free(ta); - - int init = 1; - - // work loop - for (; ;) { -#if PROFILE_JL_THREADING - uint64_t tstart = uv_hrtime(); -#endif - - ti_threadgroup_fork(tg, ptls->tid, (void **)&work, init); - init = 0; - JL_GC_PROMISE_ROOTED(work); - -#if PROFILE_JL_THREADING - uint64_t tfork = uv_hrtime(); - fork_ns[ptls->tid] += tfork - tstart; -#endif - - if (work) { - if (work->command == TI_THREADWORK_DONE) { - break; - } - else if (work->command == TI_THREADWORK_RUN) { - // TODO: return value? reduction? - // TODO: before we support getting return value from - // the work, and after we have proper GC transition - // support in the codegen and runtime we don't need to - // enter GC unsafe region when starting the work. - int8_t gc_state = jl_gc_unsafe_enter(ptls); - // This is probably always NULL for now - size_t last_age = ptls->world_age; - ptls->world_age = work->world_age; - ti_run_fun(work->fptr, work->mfunc, work->args, work->nargs); - ptls->world_age = last_age; - jl_gc_unsafe_leave(ptls, gc_state); - } - } - -#if PROFILE_JL_THREADING - uint64_t tuser = uv_hrtime(); - user_ns[ptls->tid] += tuser - tfork; -#endif - - ti_threadgroup_join(tg, ptls->tid); - -#if PROFILE_JL_THREADING - uint64_t tjoin = uv_hrtime(); - join_ns[ptls->tid] += tjoin - tuser; -#endif - - // TODO: - // nowait should skip the join, but confirm that fork is reentrant - } -} - -#if PROFILE_JL_THREADING -void ti_reset_timings(void); -#endif - ssize_t jl_tls_offset = -1; #ifdef JL_ELF_TLS_VARIANT @@ -556,36 +432,38 @@ void jl_init_threading(void) int max_threads = jl_cpu_threads(); jl_n_threads = JULIA_NUM_THREADS; cp = getenv(NUM_THREADS_NAME); - if (cp) { + if (cp) jl_n_threads = (uint64_t)strtol(cp, NULL, 10); - } if (jl_n_threads > max_threads) jl_n_threads = max_threads; if (jl_n_threads <= 0) jl_n_threads = 1; - jl_all_tls_states = (jl_ptls_t*)malloc(jl_n_threads * sizeof(void*)); + // thread sleep threshold + jl_thread_sleep_threshold = DEFAULT_THREAD_SLEEP_THRESHOLD; + cp = getenv(THREAD_SLEEP_THRESHOLD_NAME); + if (cp) { + if (!strncasecmp(cp, "infinite", 8)) + jl_thread_sleep_threshold = 0; + else + jl_thread_sleep_threshold = (uint64_t)strtol(cp, NULL, 10); + } -#if PROFILE_JL_THREADING - // set up space for profiling information - fork_ns = (uint64_t*)jl_malloc_aligned(jl_n_threads * sizeof(uint64_t), 64); - user_ns = (uint64_t*)jl_malloc_aligned(jl_n_threads * sizeof(uint64_t), 64); - join_ns = (uint64_t*)jl_malloc_aligned(jl_n_threads * sizeof(uint64_t), 64); - ti_reset_timings(); -#endif + jl_all_tls_states = (jl_ptls_t*)malloc(jl_n_threads * sizeof(void*)); - // initialize this master thread (set tid, create heap, etc.) - ti_init_master_thread(); + // initialize this thread (set tid, create heap, etc.) 
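+    // (the master thread is always tid 0)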
+ jl_init_threadtls(0); } +static uv_barrier_t thread_init_done; + void jl_start_threads(void) { - jl_ptls_t ptls = jl_get_ptls_states(); int cpumasksize = uv_cpumask_size(); char *cp; int i, exclusive; uv_thread_t uvtid; - ti_threadarg_t **targs; + jl_threadarg_t **targs; if (cpumasksize < jl_n_threads) // also handles error case cpumasksize = jl_n_threads; char *mask = (char*)alloca(cpumasksize); @@ -607,19 +485,23 @@ void jl_start_threads(void) mask[0] = 0; } + // initialize threading infrastructure + jl_init_threadinginfra(); + // The analyzer doesn't know jl_n_threads doesn't change, help it size_t nthreads = jl_n_threads; // create threads - targs = (ti_threadarg_t **)malloc((nthreads - 1) * sizeof (ti_threadarg_t *)); + targs = (jl_threadarg_t **)malloc((nthreads - 1) * sizeof (jl_threadarg_t *)); uv_barrier_init(&thread_init_done, nthreads); for (i = 0; i < nthreads - 1; ++i) { - targs[i] = (ti_threadarg_t *)malloc(sizeof (ti_threadarg_t)); - targs[i]->state = TI_THREAD_INIT; + targs[i] = (jl_threadarg_t *)malloc(sizeof (jl_threadarg_t)); targs[i]->tid = i + 1; - uv_thread_create(&uvtid, ti_threadfun, targs[i]); + targs[i]->barrier = &thread_init_done; + jl_init_threadarg(targs[i]); + uv_thread_create(&uvtid, jl_threadfun, targs[i]); if (exclusive) { mask[i + 1] = 1; uv_thread_setaffinity(&uvtid, mask, NULL, cpumasksize); @@ -628,17 +510,7 @@ void jl_start_threads(void) uv_thread_detach(&uvtid); } - // set up the world thread group - ti_threadgroup_create(1, nthreads, 1, &tgworld); - for (i = 0; i < nthreads; ++i) - ti_threadgroup_addthread(tgworld, i, NULL); - ti_threadgroup_initthread(tgworld, ptls->tid); - - // give the threads the world thread group; they will block waiting for fork - for (i = 0; i < nthreads - 1; ++i) { - targs[i]->tg = tgworld; - jl_atomic_store_release(&targs[i]->state, TI_THREAD_WORK); - } + jl_init_started_threads(targs); uv_barrier_wait(&thread_init_done); @@ -646,155 +518,17 @@ void jl_start_threads(void) free(targs); } -// TODO: is this needed? where/when/how to call it? -void jl_shutdown_threading(void) -{ - jl_ptls_t ptls = jl_get_ptls_states(); - // stop the spinning threads by sending them a command - ti_threadwork_t *work = &threadwork; - - work->command = TI_THREADWORK_DONE; - ti_threadgroup_fork(tgworld, ptls->tid, (void **)&work, 0); - - sleep(1); - - // destroy the world thread group - ti_threadgroup_destroy(tgworld); - -#if PROFILE_JL_THREADING - jl_free_aligned(join_ns); - jl_free_aligned(user_ns); - jl_free_aligned(fork_ns); - fork_ns = user_ns = join_ns = NULL; -#endif -} - -// interface to user code: specialize and compile the user thread function -// and run it in all threads -JL_DLLEXPORT jl_value_t *jl_threading_run(jl_value_t *_args) -{ - jl_ptls_t ptls = jl_get_ptls_states(); - // GC safe -#if PROFILE_JL_THREADING - uint64_t tstart = uv_hrtime(); -#endif - uint32_t nargs; - jl_value_t **args; - if (!jl_is_svec(_args)) { - nargs = 1; - args = &_args; - } - else { - nargs = jl_svec_len(_args); - args = jl_svec_data(_args); - } - - int8_t gc_state = jl_gc_unsafe_enter(ptls); - - size_t world = jl_get_ptls_states()->world_age; - threadwork.command = TI_THREADWORK_RUN; - threadwork.mfunc = jl_lookup_generic(args, nargs, - jl_int32hash_fast(jl_return_address()), world); - // Ignore constant return value for now. 
- threadwork.fptr = jl_compile_method_internal(&threadwork.mfunc, world); - if (threadwork.fptr == jl_fptr_const_return) - return jl_nothing; - threadwork.args = args; - threadwork.nargs = nargs; - threadwork.ret = jl_nothing; - threadwork.world_age = world; - -#if PROFILE_JL_THREADING - uint64_t tcompile = uv_hrtime(); - prep_ns += (tcompile - tstart); -#endif - - // fork the world thread group - ti_threadwork_t *tw = &threadwork; - ti_threadgroup_fork(tgworld, ptls->tid, (void **)&tw, 0); - -#if PROFILE_JL_THREADING - uint64_t tfork = uv_hrtime(); - fork_ns[ptls->tid] += (tfork - tcompile); -#endif - - // this thread must do work too (TODO: reduction?) - JL_GC_PROMISE_ROOTED(threadwork.mfunc); - tw->ret = ti_run_fun(threadwork.fptr, threadwork.mfunc, args, nargs); - -#if PROFILE_JL_THREADING - uint64_t trun = uv_hrtime(); - user_ns[ptls->tid] += (trun - tfork); -#endif - - // wait for completion (TODO: nowait?) - ti_threadgroup_join(tgworld, ptls->tid); - -#if PROFILE_JL_THREADING - uint64_t tjoin = uv_hrtime(); - join_ns[ptls->tid] += (tjoin - trun); -#endif - - jl_gc_unsafe_leave(ptls, gc_state); - - return tw->ret; -} - -#if PROFILE_JL_THREADING - -void ti_reset_timings(void) -{ - int i; - prep_ns = 0; - for (i = 0; i < jl_n_threads; i++) - fork_ns[i] = user_ns[i] = join_ns[i] = 0; -} - -void ti_timings(uint64_t *times, uint64_t *min, uint64_t *max, uint64_t *avg) -{ - int i; - *min = UINT64_MAX; - *max = *avg = 0; - for (i = 0; i < jl_n_threads; i++) { - if (times[i] < *min) - *min = times[i]; - if (times[i] > *max) - *max = times[i]; - *avg += times[i]; - } - *avg /= jl_n_threads; -} - -#define NS_TO_SECS(t) ((t) / (double)1e9) - -JL_DLLEXPORT void jl_threading_profile(void) -{ - if (!fork_ns) return; - - printf("\nti profile:\n"); - printf("prep: %g (%" PRIu64 ")\n", NS_TO_SECS(prep_ns), prep_ns); - - uint64_t min, max, avg; - ti_timings(fork_ns, &min, &max, &avg); - printf("fork: %g (%g - %g)\n", NS_TO_SECS(min), NS_TO_SECS(max), - NS_TO_SECS(avg)); - ti_timings(user_ns, &min, &max, &avg); - printf("user: %g (%g - %g)\n", NS_TO_SECS(min), NS_TO_SECS(max), - NS_TO_SECS(avg)); - ti_timings(join_ns, &min, &max, &avg); - printf("join: %g (%g - %g)\n", NS_TO_SECS(min), NS_TO_SECS(max), - NS_TO_SECS(avg)); -} - -#else //!PROFILE_JL_THREADING +#else // !JULIA_ENABLE_THREADING -JL_DLLEXPORT void jl_threading_profile(void) +void jl_init_threading(void) { + static jl_ptls_t _jl_all_tls_states; + jl_all_tls_states = &_jl_all_tls_states; + jl_n_threads = 1; + jl_init_threadtls(0); } -#endif //!PROFILE_JL_THREADING - -#else // !JULIA_ENABLE_THREADING +void jl_start_threads(void) { } JL_DLLEXPORT jl_value_t *jl_threading_run(jl_value_t *_args) { @@ -815,19 +549,9 @@ JL_DLLEXPORT jl_value_t *jl_threading_run(jl_value_t *_args) jl_callptr_t fptr = jl_compile_method_internal(&mfunc, world); if (fptr == jl_fptr_const_return) return jl_nothing; - return ti_run_fun(fptr, mfunc, args, nargs); -} - -void jl_init_threading(void) -{ - static jl_ptls_t _jl_all_tls_states; - jl_all_tls_states = &_jl_all_tls_states; - jl_n_threads = 1; - ti_init_master_thread(); + return jl_thread_run_fun(fptr, mfunc, args, nargs); } -void jl_start_threads(void) { } - #endif // !JULIA_ENABLE_THREADING // Make gc alignment available for threading diff --git a/src/threading.h b/src/threading.h index 8c812ca3c2676..a2b4501a56272 100644 --- a/src/threading.h +++ b/src/threading.h @@ -8,50 +8,40 @@ extern "C" { #endif -#include "threadgroup.h" #include "julia.h" #define PROFILE_JL_THREADING 0 -// thread ID -extern jl_ptls_t 
*jl_all_tls_states; -extern JL_DLLEXPORT int jl_n_threads; // # threads we're actually using - -// thread state -enum { - TI_THREAD_INIT, - TI_THREAD_WORK -}; - -// passed to thread function -typedef struct { - int16_t volatile state; - int16_t tid; - ti_threadgroup_t *tg; -} ti_threadarg_t; - -// commands to thread function -enum { - TI_THREADWORK_DONE, - TI_THREADWORK_RUN -}; - -// work command to thread function -typedef struct { - uint8_t command; - jl_method_instance_t *mfunc; - jl_callptr_t fptr; - jl_value_t **args; - uint32_t nargs; - jl_value_t *ret; - size_t world_age; -} ti_threadwork_t; - -// thread function -void ti_threadfun(void *arg); - -// helpers for thread function -jl_value_t *ti_runthread(jl_function_t *f, jl_svec_t *args, size_t nargs); +extern jl_ptls_t *jl_all_tls_states; /* thread local storage */ +extern JL_DLLEXPORT int jl_n_threads; /* # threads we're actually using */ + +typedef struct _jl_threadarg_t { + int16_t tid; + uv_barrier_t *barrier; + void *arg; +} jl_threadarg_t; + +// each thread must initialize its TLS +void jl_init_threadtls(int16_t tid); + +// generic helper for a thread to run a function +jl_value_t *jl_thread_run_fun(jl_callptr_t fptr, jl_method_instance_t *mfunc, + jl_value_t **args, uint32_t nargs); + +// provided by a threading infrastructure +void jl_init_threadinginfra(void); +void jl_init_threadarg(jl_threadarg_t *targ); +void jl_init_started_threads(jl_threadarg_t **targs); +void jl_threadfun(void *arg); + +// interfaces defined by threading infrastructures +#ifdef JULIA_ENABLE_FORKJOIN_TI +#include "forkjoin-ti.h" +#else +#ifdef JULIA_ENABLE_PARTR +#include "partr.h" +#endif +#endif #ifdef __cplusplus } diff --git a/stdlib/Distributed/test/distributed_exec.jl b/stdlib/Distributed/test/distributed_exec.jl index b82d66e685af9..ad6b6052905b5 100644 --- a/stdlib/Distributed/test/distributed_exec.jl +++ b/stdlib/Distributed/test/distributed_exec.jl @@ -756,11 +756,9 @@ function f13168(n) val end let t = schedule(@task f13168(100)) - @test t.state == :queued + wait(t) @test_throws ErrorException schedule(t) - yield() @test t.state == :done - @test_throws ErrorException schedule(t) @test isa(fetch(t),Float64) end diff --git a/stdlib/FileWatching/src/FileWatching.jl b/stdlib/FileWatching/src/FileWatching.jl index 484540ba1c547..86aec8c997c84 100644 --- a/stdlib/FileWatching/src/FileWatching.jl +++ b/stdlib/FileWatching/src/FileWatching.jl @@ -338,7 +338,7 @@ function uv_pollcb(handle::Ptr{Cvoid}, status::Int32, events::Int32) else t.events |= events if t.active[1] || t.active[2] - if isempty(t.notify.waitq) + if isempty(t.notify) # if we keep hearing about events when nobody appears to be listening, # stop the poll to save cycles t.active = (false, false) @@ -400,7 +400,7 @@ function start_watching(t::PollingFileWatcher) end function stop_watching(t::PollingFileWatcher) - if t.active && isempty(t.notify.waitq) + if t.active && isempty(t.notify) t.active = false uv_error("PollingFileWatcher (stop)", ccall(:uv_fs_poll_stop, Int32, (Ptr{Cvoid},), t.handle)) @@ -420,7 +420,7 @@ function start_watching(t::FileMonitor) end function stop_watching(t::FileMonitor) - if t.active && isempty(t.notify.waitq) + if t.active && isempty(t.notify) t.active = false uv_error("FileMonitor (stop)", ccall(:uv_fs_event_stop, Int32, (Ptr{Cvoid},), t.handle)) diff --git a/stdlib/FileWatching/test/runtests.jl b/stdlib/FileWatching/test/runtests.jl index 80af4d8e4b2fc..2c0a7fcbbf2dc 100644 --- a/stdlib/FileWatching/test/runtests.jl +++ 
b/stdlib/FileWatching/test/runtests.jl @@ -31,7 +31,7 @@ function pfd_tst_reads(idx, intvl) global ready += 1 wait(ready_c) t_elapsed = @elapsed begin - start_evt2 = Condition() + start_evt2 = Threads.Event() evt2 = @async (notify(start_evt2); poll_fd(pipe_fds[idx][1], intvl; readable=true, writable=false)) wait(start_evt2); yield() # make sure the async poll_fd is pumping events evt = poll_fd(pipe_fds[idx][1], intvl; readable=true, writable=false) @@ -59,7 +59,7 @@ function pfd_tst_timeout(idx, intvl) global ready += 1 wait(ready_c) t_elapsed = @elapsed begin - start_evt2 = Condition() + start_evt2 = Threads.Event() evt2 = @async (notify(start_evt2); poll_fd(pipe_fds[idx][1], intvl; readable=true, writable=false)) wait(start_evt2); yield() # make sure the async poll_fd is pumping events evt = poll_fd(pipe_fds[idx][1], intvl; readable=true, writable=false) @@ -384,7 +384,7 @@ mv(file * "~", file) let changes = [] while true let c - timeout = Sys.iswindows() ? 0.1 : 0.0 + timeout = 0.1 @test @elapsed(c = watch_folder(dir, timeout)) < 0.5 push!(changes, c) (c.second::FileWatching.FileEvent).timedout && break diff --git a/stdlib/Sockets/src/addrinfo.jl b/stdlib/Sockets/src/addrinfo.jl index 59e4f2dae1088..c484d41671a01 100644 --- a/stdlib/Sockets/src/addrinfo.jl +++ b/stdlib/Sockets/src/addrinfo.jl @@ -16,7 +16,7 @@ function uv_getaddrinfocb(req::Ptr{Cvoid}, status::Cint, addrinfo::Ptr{Cvoid}) t = unsafe_pointer_to_objref(data)::Task uv_req_set_data(req, C_NULL) if status != 0 || addrinfo == C_NULL - schedule(t, _UVError("getaddrinfocb", status)) + Base.schedule(t, _UVError("getaddrinfocb", status)) else freeaddrinfo = addrinfo addrs = IPAddr[] @@ -33,7 +33,7 @@ function uv_getaddrinfocb(req::Ptr{Cvoid}, status::Cint, addrinfo::Ptr{Cvoid}) addrinfo = ccall(:jl_next_from_addrinfo, Ptr{Cvoid}, (Ptr{Cvoid},), addrinfo) end ccall(:uv_freeaddrinfo, Cvoid, (Ptr{Cvoid},), freeaddrinfo) - schedule(t, addrs) + Base.schedule(t, addrs) end else # no owner for this req, safe to just free it @@ -129,9 +129,9 @@ function uv_getnameinfocb(req::Ptr{Cvoid}, status::Cint, hostname::Cstring, serv t = unsafe_pointer_to_objref(data)::Task uv_req_set_data(req, C_NULL) if status != 0 - schedule(t, _UVError("getnameinfocb", status)) + Base.schedule(t, _UVError("getnameinfocb", status)) else - schedule(t, unsafe_string(hostname)) + Base.schedule(t, unsafe_string(hostname)) end else # no owner for this req, safe to just free it diff --git a/stdlib/Sockets/test/runtests.jl b/stdlib/Sockets/test/runtests.jl index 6065debb2b8b7..0976d1e05c446 100644 --- a/stdlib/Sockets/test/runtests.jl +++ b/stdlib/Sockets/test/runtests.jl @@ -139,7 +139,7 @@ defaultport = rand(2000:4000) mktempdir() do tmpdir socketname = Sys.iswindows() ? 
("\\\\.\\pipe\\uv-test-" * randstring(6)) : joinpath(tmpdir, "socket") - c = Condition() + c = Threads.Event() tsk = @async begin s = listen(socketname) notify(c) @@ -415,7 +415,7 @@ end let addr = Sockets.InetAddr(ip"127.0.0.1", 4444) srv = listen(addr) - r = @async close(srv) + r = @async (sleep(1); close(srv)) @test_throws Base._UVError("accept", Base.UV_ECONNABORTED) accept(srv) fetch(r) end @@ -424,7 +424,7 @@ end srv = listen(addr) s = Sockets.TCPSocket() Sockets.connect!(s, addr) - r = @async close(s) + r = @async (sleep(1); close(s)) @test_throws Base._UVError("connect", Base.UV_ECANCELED) Sockets.wait_connected(s) fetch(r) end diff --git a/test/channels.jl b/test/channels.jl index a2dcf2c4ea2cf..c2ff40cb80b7e 100644 --- a/test/channels.jl +++ b/test/channels.jl @@ -1,6 +1,6 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license -using Random +using Random, Test @testset "various constructors" begin c = Channel(1) @@ -63,10 +63,11 @@ end push!(results, ii) end end - sleep(1.0) + sleep(0.2) for i in 1:5 put!(c,i) end + sleep(0.2) close(c) end @test sum(results) == 15 @@ -135,7 +136,7 @@ using Distributed # channeled_tasks for T in [Any, Int] - chnls, tasks = Base.channeled_tasks(2, (c1,c2)->(@assert take!(c1)==1; put!(c2,2)); ctypes=[T,T], csizes=[N,N]) + chnls, tasks = Base.channeled_tasks(2, (c1,c2)->(@assert take!(c1)==1; put!(c2,2); sleep(0.2)); ctypes=[T,T], csizes=[N,N]) put!(chnls[1], 1) @test take!(chnls[2]) == 2 @test_throws InvalidStateException wait(chnls[1]) @@ -248,6 +249,7 @@ end error in running finalizer: ErrorException("task switch not allowed from inside gc finalizer") error in running finalizer: ErrorException("task switch not allowed from inside gc finalizer") """ +#= TODO: there's no Workqueue any more # test for invalid state in Workqueue during yield t = @async nothing t.state = :invalid @@ -260,24 +262,16 @@ end close(newstderr[2]) end @test fetch(errstream) == "\nWARNING: Workqueue inconsistency detected: popfirst!(Workqueue).state != :queued\n" -end - -@testset "schedule_and_wait" begin - t = @async(nothing) - ct = current_task() - testobject = "testobject" - # note: there is a low probability this test could fail, due to receiving network traffic simultaneously - @test length(Base.Workqueue) == 1 - @test Base.schedule_and_wait(ct, 8) == 8 - @test isempty(Base.Workqueue) - @test Base.schedule_and_wait(ct, testobject) === testobject +=# end @testset "throwto" begin t = @task(nothing) ct = current_task() testerr = ErrorException("expected") - @async Base.throwto(t, testerr) + # TODO: throwto() is unimplemented + #@async Base.throwto(t, testerr) + @async schedule(t, testerr, error=true) @test try Base.wait(t) false @@ -286,26 +280,26 @@ end end === testerr end +#= TODO: these tests depend on task execution ordering and that makes no +# sense with threads! +=# @testset "Timer / AsyncCondition triggering and race #12719" begin tc = Ref(0) t = Timer(0) do t tc[] += 1 end - @test isopen(t) Base.process_events(false) - @test !isopen(t) - @test tc[] == 0 yield() + @test !isopen(t) @test tc[] == 1 tc = Ref(0) - t = Timer(0) do t + t = Timer(10) do t tc[] += 1 end @test isopen(t) close(t) @test !isopen(t) - sleep(0.1) @test tc[] == 0 tc = Ref(0) @@ -320,8 +314,10 @@ end @test tc[] == 0 yield() # consume event @test tc[] == 1 - sleep(0.1) # no further events - @test tc[] == 1 + # NOTE: this depended on the scheduler not calling process_events when there + # are tasks to run. Now, this is probabilistic. 
+    #sleep(0.1) # no further events
+    #@test tc[] == 1
     ccall(:uv_async_send, Cvoid, (Ptr{Cvoid},), async)
     ccall(:uv_async_send, Cvoid, (Ptr{Cvoid},), async)
     close(async)
diff --git a/test/file.jl b/test/file.jl
index 4bdb0a2ffb2ee..579dce536462d 100644
--- a/test/file.jl
+++ b/test/file.jl
@@ -974,7 +974,7 @@ cd(dirwalk) do
     @test files == ["file1", "file2"]
 
     rm(joinpath("sub_dir1"), recursive=true)
-    @test_throws SystemError take!(chnl_error) # throws an error because sub_dir1 do not exist
+    @test_throws SystemError collect(chnl_error) # throws an error because sub_dir1 does not exist
 
     root, dirs, files = take!(chnl_noerror)
     @test root == "."
diff --git a/test/misc.jl b/test/misc.jl
index defcba93c082a..cb4ca7a402af7 100644
--- a/test/misc.jl
+++ b/test/misc.jl
@@ -126,6 +126,7 @@ let c = Ref(0),
     yield()
     @test c[] == 1
     yield(t2)
+    wait(t2)
     @test c[] == 100
 end
 
diff --git a/test/read.jl b/test/read.jl
index 944b911748b63..4ae3d0070c7dd 100644
--- a/test/read.jl
+++ b/test/read.jl
@@ -559,8 +559,9 @@ let p = Pipe()
     t = @async read(p)
     @sync begin
         @async write(p, zeros(UInt16, 660_000))
+        order::UInt16 = 0
         for i = 1:typemax(UInt16)
-            @async write(p, UInt16(i))
+            @async (order+=1; write(p, order))
         end
         @async close(p.in)
     end
diff --git a/test/spawn.jl b/test/spawn.jl
index 0cfc23a8a02df..c166268f6976e 100644
--- a/test/spawn.jl
+++ b/test/spawn.jl
@@ -58,13 +58,13 @@ out = read(`$echocmd hello` & `$echocmd world`, String)
 Sys.isunix() && run(pipeline(yescmd, `head`, devnull))
 
 let a, p
-    a = Base.Condition()
+    a = Channel(0)
     t = @async begin
         p = run(pipeline(yescmd,devnull), wait=false)
-        Base.notify(a,p)
+        put!(a, p)
         @test !success(p)
     end
-    p = wait(a)
+    p = take!(a)
     kill(p)
     wait(t)
 end
diff --git a/test/threads.jl b/test/threads.jl
index 7b79b141b660b..ee2ca9d127ce6 100644
--- a/test/threads.jl
+++ b/test/threads.jl
@@ -369,7 +369,7 @@ using Dates
 for period in (0.06, Dates.Millisecond(60))
     let async = Base.AsyncCondition(), t
         c = Condition()
-        task = schedule(Task(function()
+        task = Base.schedule(Task(function()
             notify(c)
             wait(c)
             t = Timer(period)

From 6708d27df3e7bf2dd10c9e1473ba2d90a2862fd8 Mon Sep 17 00:00:00 2001
From: Jameson Nash
Date: Fri, 9 Nov 2018 16:23:11 -0500
Subject: [PATCH 2/4] revert invalid changes

---
 stdlib/Sockets/test/runtests.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/stdlib/Sockets/test/runtests.jl b/stdlib/Sockets/test/runtests.jl
index 0976d1e05c446..5d4d09e531bd5 100644
--- a/stdlib/Sockets/test/runtests.jl
+++ b/stdlib/Sockets/test/runtests.jl
@@ -415,7 +415,7 @@ end
 
 let addr = Sockets.InetAddr(ip"127.0.0.1", 4444)
     srv = listen(addr)
-    r = @async (sleep(1); close(srv))
+    r = @async close(srv)
     @test_throws Base._UVError("accept", Base.UV_ECONNABORTED) accept(srv)
     fetch(r)
 end
@@ -424,7 +424,7 @@ end
     srv = listen(addr)
     s = Sockets.TCPSocket()
     Sockets.connect!(s, addr)
-    r = @async (sleep(1); close(s))
+    r = @async close(s)
     @test_throws Base._UVError("connect", Base.UV_ECANCELED) Sockets.wait_connected(s)
     fetch(r)
 end

From 8714c98586aaa7e30522925713f2fbec6d883ffe Mon Sep 17 00:00:00 2001
From: Jeff Bezanson
Date: Wed, 5 Dec 2018 14:51:11 -0500
Subject: [PATCH 3/4] try enabling again

---
 Make.inc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Make.inc b/Make.inc
index 19e6265b65b3e..e89d6614dbe17 100644
--- a/Make.inc
+++ b/Make.inc
@@ -69,7 +69,7 @@ USEIFC ?= 0
 JULIA_THREADS := 1
 
 # Enable the parallel task runtime
-JULIA_PARTR ?= 0
+JULIA_PARTR ?= 1
ifeq ($(JULIA_THREADS), 0)
 JULIA_PARTR := 0
 endif

From 
70b4852f374cc21e29af0ba133e8924d6ae694b0 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Wed, 5 Dec 2018 21:54:31 +0000 Subject: [PATCH 4/4] fix behavior divergence with non-partr build --- src/partr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/partr.c b/src/partr.c index 8b8672eb56089..573971aa5c2bd 100644 --- a/src/partr.c +++ b/src/partr.c @@ -710,6 +710,8 @@ static int run_next(void) } jl_switchto(&task); + if (ptls->tid == 0) + jl_process_events(jl_global_event_loop()); JL_GC_POP(); return 1;
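
A recurring pattern in the test changes of this series is replacing `Condition()` with `Threads.Event()`. A bare `Condition` is edge-triggered: a `notify` that fires while no task is blocked in `wait` is simply lost. That was safe under the old cooperative single-threaded scheduler, but races once the notifying task can run concurrently on another thread. A minimal sketch of the pattern, assuming the level-triggered semantics of the `Threads.Event` type these patches adopt (variable names are illustrative):

```julia
# Level-triggered signaling: a notify() that happens before the wait()
# is not lost, regardless of which thread runs the task first.
started = Threads.Event()
t = @async begin
    notify(started)   # may execute before *or* after the wait() below
    # ... the work the test actually exercises ...
end
wait(started)         # returns immediately if already notified; a bare
                      # Condition would block forever under that ordering
wait(t)
```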
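
The `spawn.jl` change expresses the same handoff with an unbuffered channel: `put!` on a `Channel(0)` blocks until a matching `take!`, so the rendezvous succeeds no matter which side arrives first, whereas `notify(a, p)` on a `Condition` drops the value when the waiter has not yet blocked. Roughly, with a placeholder value standing in for the process object the real test hands over:

```julia
# Rendezvous through an unbuffered channel: the handed-off value cannot
# be dropped, because put! does not complete until a take! accepts it.
a = Channel(0)
t = @async put!(a, :some_result)   # blocks in put! until the take! below
p = take!(a)
wait(t)
@assert p === :some_result
```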
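
Several other edits (`distributed_exec.jl`, `misc.jl`, the Timer and AsyncCondition tests) drop assertions about transient scheduler state, such as `t.state == :queued` or event counts after a bare `yield()`. With a multithreaded runtime a scheduled task may start, or even finish, before the scheduling task looks at it, so only terminal states are reliable test targets. The surviving pattern, as a sketch:

```julia
using Test

# Assert only terminal task state: under partr a task may run as soon
# as it is scheduled, so :queued cannot be observed reliably.
t = schedule(Task(() -> 1 + 1))
wait(t)                                  # blocks until the task finishes
@test t.state == :done
@test fetch(t) == 2
@test_throws ErrorException schedule(t)  # rescheduling a finished task errors
```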