xnu-1228.15.4
Darwin authored and das committed Jun 4, 2017
1 parent 4a63d54 commit 1c19685
Showing 28 changed files with 842 additions and 72 deletions.
28 changes: 28 additions & 0 deletions bsd/hfs/hfs.h
@@ -46,6 +46,7 @@
#include <sys/quota.h>
#include <sys/dirent.h>
#include <sys/event.h>
#include <kern/thread_call.h>

#include <kern/locks.h>

@@ -272,8 +273,34 @@ typedef struct hfsmount {
/* Resize variables: */
u_int32_t hfs_resize_filesmoved;
u_int32_t hfs_resize_totalfiles;

/*
* About the sync counters:
* hfs_sync_scheduled keeps track of whether a timer was scheduled but we
* haven't started processing the callback (i.e. we
* haven't begun the flush). It remains non-zero even
* after the callback has been invoked, until we start
* the flush.
* hfs_sync_incomplete keeps track of the number of callbacks that have
* not completed yet (including callbacks not yet
* invoked). We cannot safely unmount until this
* drops to zero.
*
* In both cases, we use counters, not flags, so that we can avoid
* taking locks.
*/
int32_t hfs_sync_scheduled;
int32_t hfs_sync_incomplete;
u_int64_t hfs_last_sync_request_time;
u_int64_t hfs_last_sync_time;
uint32_t hfs_active_threads;
thread_call_t hfs_syncer; // removable devices get synced by this thread call

} hfsmount_t;

#define HFS_META_DELAY (100)
#define HFS_MILLISEC_SCALE (1000*1000)

typedef hfsmount_t ExtendedVCB;

/* Aliases for legacy (Mac OS 9) field names */
@@ -689,6 +716,7 @@ extern int hfs_virtualmetafile(struct cnode *);

extern int hfs_start_transaction(struct hfsmount *hfsmp);
extern int hfs_end_transaction(struct hfsmount *hfsmp);
extern void hfs_sync_ejectable(struct hfsmount *hfsmp);


/*****************************************************************************
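The lock-free handshake described by the new sync counters in hfs.h can be modeled in isolation. The sketch below is an illustrative userspace analogue using C11 atomics, not code from this commit; timer_enter_delayed() is a hypothetical stand-in for thread_call_enter_delayed(), assumed to return true when a call was already queued.

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int sync_scheduled;   /* timer armed, flush not yet started  */
static atomic_int sync_incomplete;  /* callbacks owed but not yet finished */

/* hypothetical stand-in: arms the timer, returns true if already pending */
static bool timer_enter_delayed(void) { return false; }

static void schedule_sync(void)     /* analogue of hfs_sync_ejectable() */
{
    if (atomic_load(&sync_scheduled) == 0) {
        /* optimistically assume we are the thread arming the timer */
        atomic_fetch_add(&sync_scheduled, 1);
        if (timer_enter_delayed())
            atomic_fetch_sub(&sync_scheduled, 1);  /* lost the race: undo  */
        else
            atomic_fetch_add(&sync_incomplete, 1); /* we now owe a callback */
    }
}

static void syncer_callback(void)   /* analogue of hfs_syncer() */
{
    /* ... perform the flush ... */
    atomic_fetch_sub(&sync_scheduled, 1);  /* flush underway or done      */
    atomic_fetch_sub(&sync_incomplete, 1); /* unmount may proceed at zero */
}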
16 changes: 12 additions & 4 deletions bsd/hfs/hfs_readwrite.c
@@ -1836,12 +1836,20 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* {
}

case HFS_GET_MOUNT_TIME:
- return copyout(&hfsmp->hfs_mount_time, CAST_USER_ADDR_T(ap->a_data), sizeof(hfsmp->hfs_mount_time));
- break;
+ if (is64bit) {
+ *(user_time_t *)(ap->a_data) = (user_time_t) hfsmp->hfs_mount_time;
+ } else {
+ *(time_t *)(ap->a_data) = (time_t) hfsmp->hfs_mount_time;
+ }
+ return 0;

case HFS_GET_LAST_MTIME:
- return copyout(&hfsmp->hfs_last_mounted_mtime, CAST_USER_ADDR_T(ap->a_data), sizeof(hfsmp->hfs_last_mounted_mtime));
- break;
+ if (is64bit) {
+ *(user_time_t *)(ap->a_data) = (user_time_t) hfsmp->hfs_last_mounted_mtime;
+ } else {
+ *(time_t *)(ap->a_data) = (time_t) hfsmp->hfs_last_mounted_mtime;
+ }
+ return 0;

case HFS_SET_BOOT_INFO:
if (!vnode_isvroot(vp))
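The two ioctl cases above previously handed ap->a_data, which the ioctl layer has already copied into a kernel buffer, to copyout() as if it were a user-space address; the replacement stores the result directly into that buffer, sized for the caller's ABI (user_time_t for 64-bit processes, time_t otherwise). A minimal userspace sketch of a caller follows; it assumes HFS_GET_MOUNT_TIME is exposed through <hfs/hfs_fsctl.h> (header location not confirmed by this diff) and that "/Volumes/SDCard" is a mounted HFS+ volume — both are illustrative assumptions.

#include <stdio.h>
#include <time.h>
#include <sys/fsctl.h>      /* fsctl(2) */
#include <hfs/hfs_fsctl.h>  /* assumed home of HFS_GET_MOUNT_TIME */

int main(void)
{
    time_t mount_time = 0;

    /* fsctl(2) routes to hfs_vnop_ioctl() for the volume owning the path;
     * the kernel copies the argument back out for us on success. */
    if (fsctl("/Volumes/SDCard", HFS_GET_MOUNT_TIME, &mount_time, 0) == -1) {
        perror("fsctl(HFS_GET_MOUNT_TIME)");
        return 1;
    }
    printf("volume mounted at: %s", ctime(&mount_time));
    return 0;
}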
152 changes: 152 additions & 0 deletions bsd/hfs/hfs_vfsops.c
@@ -827,6 +827,99 @@ hfs_reload(struct mount *mountp)
return (0);
}

int hfs_last_io_wait_time = 125000;
SYSCTL_INT (_kern, OID_AUTO, hfs_last_io_wait_time, CTLFLAG_RW, &hfs_last_io_wait_time, 0, "number of usecs to wait after an i/o before syncing ejectable media");

static void
hfs_syncer(void *arg0, void *unused)
{
#pragma unused(unused)

struct hfsmount *hfsmp = arg0;
uint32_t secs, usecs, delay = HFS_META_DELAY;
uint64_t now;
struct timeval nowtv, last_io;

clock_get_calendar_microtime(&secs, &usecs);
now = ((uint64_t)secs * 1000000LL) + usecs;
//
// If we have put off the last sync for more than
// 5 seconds, force it so that we don't let too
// much i/o queue up (since flushing the journal
// causes the i/o queue to drain)
//
if ((now - hfsmp->hfs_last_sync_time) >= 5000000LL) {
goto doit;
}

//
// Find out when the last i/o was done to this device (read or write).
//
throttle_info_get_last_io_time(hfsmp->hfs_mp, &last_io);
microuptime(&nowtv);
timevalsub(&nowtv, &last_io);

//
// If the last i/o was too recent, defer this sync until later.
// The limit chosen (125 milliseconds) was based on
// experiments copying data to an SD card and seems to
// prevent us from issuing too many syncs.
//
if (nowtv.tv_sec >= 0 && nowtv.tv_usec > 0 && nowtv.tv_usec < hfs_last_io_wait_time) {
delay /= 2;
goto resched;
}

//
// If there's pending i/o, also skip the sync.
//
if (hfsmp->hfs_devvp && hfsmp->hfs_devvp->v_numoutput > 0) {
goto resched;
}


//
// Only flush the journal if we have not synced recently,
// the last sync request was more than 100 milliseconds
// ago, and no one is in the middle of a transaction right
// now. Otherwise we defer the sync and reschedule it
// for later.
//
if ( ((now - hfsmp->hfs_last_sync_time) >= 100000LL)
&& ((now - hfsmp->hfs_last_sync_request_time) >= 100000LL)
&& (hfsmp->hfs_active_threads == 0)
&& (hfsmp->hfs_global_lock_nesting == 0)) {

doit:
OSAddAtomic(1, (SInt32 *)&hfsmp->hfs_active_threads);
if (hfsmp->jnl) {
journal_flush(hfsmp->jnl);
}
OSAddAtomic(-1, (SInt32 *)&hfsmp->hfs_active_threads);

clock_get_calendar_microtime(&secs, &usecs);
hfsmp->hfs_last_sync_time = ((int64_t)secs * 1000000) + usecs;

} else if (hfsmp->hfs_active_threads == 0) {
uint64_t deadline;

resched:
clock_interval_to_deadline(delay, HFS_MILLISEC_SCALE, &deadline);
thread_call_enter_delayed(hfsmp->hfs_syncer, deadline);
return;
}

//
// NOTE: we decrement these *after* we're done with the journal_flush() since
// it can take a significant amount of time and we don't want more
// callbacks scheduled until we're done with this one.
//
OSDecrementAtomic((volatile SInt32 *)&hfsmp->hfs_sync_scheduled);
OSDecrementAtomic((volatile SInt32 *)&hfsmp->hfs_sync_incomplete);
wakeup((caddr_t)&hfsmp->hfs_sync_incomplete);
}
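/*
 * Illustrative sketch (not part of the commit): the deadline arithmetic
 * in the resched path above works out as follows, assuming
 * clock_interval_to_deadline() takes its scale factor in nanoseconds per
 * interval unit (consistent with the HFS_MILLISEC_SCALE definition):
 *
 *   HFS_META_DELAY     = 100         -> interval, read as milliseconds
 *   HFS_MILLISEC_SCALE = 1000*1000   -> nanoseconds per millisecond
 *
 * so the syncer normally re-arms 100 ms out, and when the last i/o was
 * too recent, delay /= 2 retries in 50 ms instead:
 *
 *   uint64_t deadline;
 *   clock_interval_to_deadline(HFS_META_DELAY / 2, HFS_MILLISEC_SCALE,
 *                              &deadline);              // 50 ms from now
 *   thread_call_enter_delayed(hfsmp->hfs_syncer, deadline);
 */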

extern int IOBSDIsMediaEjectable( const char *cdev_name );

/*
* Common code for mount and mountroot
@@ -855,12 +948,18 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args,
u_int32_t iswritable;
daddr64_t mdb_offset;
int isvirtual = 0;
int isroot = 0;

ronly = vfs_isrdonly(mp);
dev = vnode_specrdev(devvp);
cred = p ? vfs_context_ucred(context) : NOCRED;
mntwrapper = 0;

if (args == NULL) {
/* only hfs_mountroot passes us NULL as the 'args' argument */
isroot = 1;
}

bp = NULL;
hfsmp = NULL;
mdbp = NULL;
@@ -1379,6 +1478,18 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args,
}
}

/* ejectability checks will time out when the device is root_device, so skip them */
if (isroot == 0) {
if ((hfsmp->hfs_flags & HFS_VIRTUAL_DEVICE) == 0 &&
IOBSDIsMediaEjectable(mp->mnt_vfsstat.f_mntfromname)) {
hfsmp->hfs_syncer = thread_call_allocate(hfs_syncer, hfsmp);
if (hfsmp->hfs_syncer == NULL) {
printf("hfs: failed to allocate syncer thread callback for %s (%s)\n",
mp->mnt_vfsstat.f_mntfromname, mp->mnt_vfsstat.f_mntonname);
}
}
}

/*
* Start looking for free space to drop below this level and generate a
* warning immediately if needed:
@@ -1451,6 +1562,38 @@ hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context)
if (hfsmp->hfs_flags & HFS_METADATA_ZONE)
(void) hfs_recording_suspend(hfsmp);

/*
* Cancel any pending timers for this volume. Then wait for any timers
* which have fired, but whose callbacks have not yet completed.
*/
if (hfsmp->hfs_syncer)
{
struct timespec ts = {0, 100000000}; /* 0.1 seconds */

/*
* Cancel any timers that have been scheduled, but have not
* fired yet. NOTE: The kernel considers a timer complete as
* soon as it starts your callback, so the kernel does not
* keep track of the number of callbacks in progress.
*/
if (thread_call_cancel(hfsmp->hfs_syncer))
OSDecrementAtomic((volatile SInt32 *)&hfsmp->hfs_sync_incomplete);
thread_call_free(hfsmp->hfs_syncer);
hfsmp->hfs_syncer = NULL;

/*
* This waits for all of the callbacks that were entered before
* we did thread_call_cancel above, but have not completed yet.
*/
while(hfsmp->hfs_sync_incomplete > 0)
{
msleep((caddr_t)&hfsmp->hfs_sync_incomplete, NULL, PWAIT, "hfs_unmount", &ts);
}

if (hfsmp->hfs_sync_incomplete < 0)
printf("hfs_unmount: pm_sync_incomplete underflow (%d)!\n", hfsmp->hfs_sync_incomplete);
}

/*
* Flush out the b-trees, volume bitmap and Volume Header
*/
@@ -1931,6 +2074,15 @@ hfs_sync(struct mount *mp, int waitfor, vfs_context_t context)
journal_flush(hfsmp->jnl);
}

{
uint32_t secs, usecs;
uint64_t now;

clock_get_calendar_microtime(&secs, &usecs);
now = ((uint64_t)secs * 1000000LL) + usecs;
hfsmp->hfs_last_sync_time = now;
}

lck_rw_unlock_shared(&hfsmp->hfs_insync);
return (allerror);
}
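The unmount path above follows a cancel-then-drain pattern that pairs with the wakeup() at the end of hfs_syncer(): cancel any timer that has not fired (un-counting the callback it would have run), then sleep until every in-flight callback has finished. A condensed sketch of the pattern, with callout and sync_incomplete as stand-ins for the hfsmount fields:

struct timespec ts = { 0, 100000000 };        /* re-check every 0.1 s at most */

if (thread_call_cancel(callout))              /* TRUE: dequeued before firing */
    OSDecrementAtomic((volatile SInt32 *)&sync_incomplete); /* never runs */
thread_call_free(callout);

while (sync_incomplete > 0)                   /* callbacks already in flight */
    msleep((caddr_t)&sync_incomplete, NULL, PWAIT, "drain", &ts);
/* the final callback's wakeup() normally cuts the msleep() short */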
44 changes: 44 additions & 0 deletions bsd/hfs/hfs_vfsutils.c
@@ -2347,6 +2347,46 @@ hfs_virtualmetafile(struct cnode *cp)
}



//
// Fire off a timed callback to sync the disk if the
// volume is on ejectable media.
//
__private_extern__
void
hfs_sync_ejectable(struct hfsmount *hfsmp)
{
if (hfsmp->hfs_syncer) {
uint32_t secs, usecs;
uint64_t now;

clock_get_calendar_microtime(&secs, &usecs);
now = ((uint64_t)secs * 1000000) + usecs;

if (hfsmp->hfs_sync_scheduled == 0) {
uint64_t deadline;

hfsmp->hfs_last_sync_request_time = now;

clock_interval_to_deadline(HFS_META_DELAY, HFS_MILLISEC_SCALE, &deadline);

/*
* Increment hfs_sync_scheduled on the assumption that we're the
* first thread to schedule the timer. If some other thread beat
* us, then we'll decrement it. If we *were* the first to
* schedule the timer, then we need to keep track that the
* callback is waiting to complete.
*/
OSIncrementAtomic((volatile SInt32 *)&hfsmp->hfs_sync_scheduled);
if (thread_call_enter_delayed(hfsmp->hfs_syncer, deadline))
OSDecrementAtomic((volatile SInt32 *)&hfsmp->hfs_sync_scheduled);
else
OSIncrementAtomic((volatile SInt32 *)&hfsmp->hfs_sync_incomplete);
}
}
}
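/*
 * Illustrative timeline of one schedule/flush cycle (a sketch, not part
 * of the commit; assumes thread_call_enter_delayed() returns TRUE only
 * when the call was already queued):
 *
 *   t0  thread A: hfs_sync_scheduled 0 -> 1; enter_delayed() == FALSE,
 *                 so hfs_sync_incomplete 0 -> 1 (A armed the timer)
 *   t1  thread B: sees hfs_sync_scheduled != 0 and does nothing
 *   t2  timer fires; hfs_syncer() starts (or defers) the flush
 *   t3  after the flush: hfs_sync_scheduled 1 -> 0,
 *                        hfs_sync_incomplete 1 -> 0,
 *                        wakeup() any hfs_unmount() waiting for zero
 */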


__private_extern__
int
hfs_start_transaction(struct hfsmount *hfsmp)
@@ -2374,6 +2414,7 @@ hfs_start_transaction(struct hfsmount *hfsmp)

if (hfsmp->jnl == NULL || journal_owner(hfsmp->jnl) != thread) {
lck_rw_lock_shared(&hfsmp->hfs_global_lock);
OSAddAtomic(1, (SInt32 *)&hfsmp->hfs_active_threads);
unlock_on_err = 1;
}

@@ -2399,6 +2440,7 @@ hfs_start_transaction(struct hfsmount *hfsmp)
out:
if (ret != 0 && unlock_on_err) {
lck_rw_unlock_shared(&hfsmp->hfs_global_lock);
OSAddAtomic(-1, (SInt32 *)&hfsmp->hfs_active_threads);
}

return ret;
@@ -2424,7 +2466,9 @@ hfs_end_transaction(struct hfsmount *hfsmp)
}

if (need_unlock) {
OSAddAtomic(-1, (SInt32 *)&hfsmp->hfs_active_threads);
lck_rw_unlock_shared(&hfsmp->hfs_global_lock);
hfs_sync_ejectable(hfsmp);
}

return ret;
18 changes: 18 additions & 0 deletions bsd/hfs/hfs_vnops.c
@@ -367,6 +367,11 @@ hfs_vnop_close(ap)
}

hfs_unlock(cp);

if (ap->a_fflag & FWASWRITTEN) {
hfs_sync_ejectable(hfsmp);
}

return (0);
}

@@ -2619,13 +2624,26 @@ hfs_vnop_rename(ap)
skip_rm:
/*
* All done with tvp and fvp
*
* We also jump to this point if no destination was observed during lookup and namei.
* However, because only iocounts are held at the VFS layer, there is nothing preventing a
* competing thread from racing us and creating a file or dir at the destination of this rename
* operation. If this occurs, it may cause us to get a spurious EEXIST out of the cat_rename
* call below. To preserve rename's atomicity, we need to signal VFS to re-drive the
* namei/lookup and restart the rename operation. EEXIST is an allowable errno to be bubbled
* out of the rename syscall, but not for this reason, since it is a synonym errno for ENOTEMPTY.
* To signal VFS, we return ERECYCLE (which is also used for lookup restarts). This errno
* will be swallowed by VFS, which will restart the operation.
*/

lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK);
error = cat_rename(hfsmp, &from_desc, &tdcp->c_desc, &to_desc, &out_desc);
hfs_systemfile_unlock(hfsmp, lockflags);

if (error) {
if (error == EEXIST) {
error = ERECYCLE;
}
goto out;
}

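Conceptually, the re-drive that the rename comment above requests from VFS is a retry loop around lookup and the filesystem's rename operation. The toy below sketches that assumed behavior — it is not the actual VFS implementation; lookup_both() and vnop_rename() are hypothetical stand-ins, and the ERECYCLE value is taken from XNU's bsd/sys/errno.h.

#define ERECYCLE (-5)   /* kernel-internal "restart the lookup" code */

/* hypothetical stand-ins for namei() and the filesystem's rename op */
static int lookup_both(void) { return 0; }
static int vnop_rename(void)
{
    static int raced = 1;                   /* simulate one racing create */
    return raced-- > 0 ? ERECYCLE : 0;
}

/* assumed VFS-layer behavior: retry the whole lookup + rename whenever
 * the filesystem asks for a restart via ERECYCLE */
int rename_redrive(void)
{
    int error;
    do {
        if ((error = lookup_both()) != 0)   /* fresh namei() each pass */
            return error;
        error = vnop_rename();              /* may see the racing create */
    } while (error == ERECYCLE);            /* never escapes to userspace */
    return error;
}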