Discussion:
[PATCH v4 0/1] nilfs2: add missing blkdev_issue_flush() to nilfs_sync_fs()
Andreas Rohner
2014-09-09 21:17:08 UTC
Permalink
Hi,

I have looked a bit more into the semantics of the various flags
concerning block device caching behaviour. According to
"Documentation/block/writeback_cache_control.txt" a call to
blkdev_issue_flush() is equivalent to an empty bio with the
REQ_FLUSH flag set. So there is no need to call blkdev_issue_flush()
after a call to nilfs_commit_super(). But if there is no need to write
the super block, an additional call to blkdev_issue_flush() is necessary.

To avoid unnecessary overhead, I introduced the nilfs->ns_flushed_device flag,
which is set to 0 whenever new logs are written and set to 1 whenever
the block device is flushed. If the super block was written during
segment construction or in nilfs_sync_fs(), then blkdev_issue_flush() is
not called.

br,
Andreas Rohner

v3->v4 (review by Ryusuke Konishi)
* replace atomic_t with int for ns_flushed_device
* use smp_wmb() to guarantee correct ordering

v2->v3 (review by Ryusuke Konishi)
* Use separate atomic flag for ns_flushed_device instead of a bit flag
in ns_flags
* Use smp_mb__after_atomic() after setting ns_flushed_device

v1->v2
* Add new flag THE_NILFS_FLUSHED

Andreas Rohner (1):
nilfs2: add missing blkdev_issue_flush() to nilfs_sync_fs()

fs/nilfs2/file.c | 10 +++++++++-
fs/nilfs2/ioctl.c | 10 +++++++++-
fs/nilfs2/segment.c | 4 ++++
fs/nilfs2/super.c | 17 +++++++++++++++++
fs/nilfs2/the_nilfs.h | 2 ++
5 files changed, 41 insertions(+), 2 deletions(-)
--
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majordomo-***@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Andreas Rohner
2014-09-09 21:17:09 UTC
Permalink
Under normal circumstances nilfs_sync_fs() writes out the super block,
which causes a flush of the underlying block device. But this depends on
the THE_NILFS_SB_DIRTY flag, which is only set if the pointer to the
last segment crosses a segment boundary. So if only a small amount of
data is written before the call to nilfs_sync_fs(), no flush of the
block device occurs.

In the above case an additional call to blkdev_issue_flush() is needed.
To prevent unnecessary overhead, the new flag nilfs->ns_flushed_device
is introduced, which is cleared whenever new logs are written and set
whenever the block device is flushed.

Signed-off-by: Andreas Rohner <andreas.rohner-***@public.gmane.org>
---
fs/nilfs2/file.c | 10 +++++++++-
fs/nilfs2/ioctl.c | 10 +++++++++-
fs/nilfs2/segment.c | 4 ++++
fs/nilfs2/super.c | 17 +++++++++++++++++
fs/nilfs2/the_nilfs.h | 2 ++
5 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 2497815..16375c2 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -56,7 +56,15 @@ int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
mutex_unlock(&inode->i_mutex);

nilfs = inode->i_sb->s_fs_info;
- if (!err && nilfs_test_opt(nilfs, BARRIER)) {
+ if (!err && nilfs_test_opt(nilfs, BARRIER) &&
+ !nilfs->ns_flushed_device) {
+ nilfs->ns_flushed_device = 1;
+ /*
+ * the store to ns_flushed_device must not be reordered after
+ * blkdev_issue_flush
+ */
+ smp_wmb();
+
err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
if (err != -EIO)
err = 0;
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 422fb54..9444d5d 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -1022,7 +1022,15 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
return ret;

nilfs = inode->i_sb->s_fs_info;
- if (nilfs_test_opt(nilfs, BARRIER)) {
+ if (nilfs_test_opt(nilfs, BARRIER) &&
+ !nilfs->ns_flushed_device) {
+ nilfs->ns_flushed_device = 1;
+ /*
+ * the store to ns_flushed_device must not be reordered after
+ * blkdev_issue_flush
+ */
+ smp_wmb();
+
ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
if (ret == -EIO)
return ret;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index a1a1916..379da1b 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1997,6 +1997,10 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
err = nilfs_segctor_wait(sci);
if (err)
goto failed_to_write;
+
+ if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) ||
+ mode == SC_LSEG_DSYNC)
+ nilfs->ns_flushed_device = 0;
}
} while (sci->sc_stage.scnt != NILFS_ST_DONE);

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 228f5bd..33aafbd 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -310,6 +310,9 @@ int nilfs_commit_super(struct super_block *sb, int flag)
nilfs->ns_sbsize));
}
clear_nilfs_sb_dirty(nilfs);
+ nilfs->ns_flushed_device = 1;
+ /* make sure store to ns_flushed_device cannot be reordered */
+ smp_wmb();
return nilfs_sync_super(sb, flag);
}

@@ -514,6 +517,20 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
}
up_write(&nilfs->ns_sem);

+ if (wait && !err && nilfs_test_opt(nilfs, BARRIER) &&
+ !nilfs->ns_flushed_device) {
+ nilfs->ns_flushed_device = 1;
+ /*
+ * the store to ns_flushed_device must not be reordered after
+ * blkdev_issue_flush
+ */
+ smp_wmb();
+
+ err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+ if (err != -EIO)
+ err = 0;
+ }
+
return err;
}

diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index d01ead1..dabb02c 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -45,6 +45,7 @@ enum {

/**
* struct the_nilfs - struct to supervise multiple nilfs mount points
+ * @ns_flushed_device: flag indicating if all volatile data was flushed
* @ns_flags: flags
* @ns_bdev: block device
* @ns_sem: semaphore for shared states
@@ -103,6 +104,7 @@ enum {
*/
struct the_nilfs {
unsigned long ns_flags;
+ int ns_flushed_device;

struct block_device *ns_bdev;
struct rw_semaphore ns_sem;
--
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majordomo-***@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Andreas Rohner
2014-09-10 09:22:39 UTC
Permalink
Post by Andreas Rohner
Under normal circumstances nilfs_sync_fs() writes out the super block,
which causes a flush of the underlying block device. But this depends on
the THE_NILFS_SB_DIRTY flag, which is only set if the pointer to the
last segment crosses a segment boundary. So if only a small amount of
data is written before the call to nilfs_sync_fs(), no flush of the
block device occurs.
In the above case an additional call to blkdev_issue_flush() is needed.
To prevent unnecessary overhead, the new flag nilfs->ns_flushed_device
is introduced, which is cleared whenever new logs are written and set
whenever the block device is flushed.
---
fs/nilfs2/file.c | 10 +++++++++-
fs/nilfs2/ioctl.c | 10 +++++++++-
fs/nilfs2/segment.c | 4 ++++
fs/nilfs2/super.c | 17 +++++++++++++++++
fs/nilfs2/the_nilfs.h | 2 ++
5 files changed, 41 insertions(+), 2 deletions(-)
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 2497815..16375c2 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -56,7 +56,15 @@ int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
mutex_unlock(&inode->i_mutex);
nilfs = inode->i_sb->s_fs_info;
- if (!err && nilfs_test_opt(nilfs, BARRIER)) {
+ if (!err && nilfs_test_opt(nilfs, BARRIER) &&
+ !nilfs->ns_flushed_device) {
+ nilfs->ns_flushed_device = 1;
+ /*
+ * the store to ns_flushed_device must not be reordered after
+ * blkdev_issue_flush
+ */
+ smp_wmb();
+
err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
if (err != -EIO)
err = 0;
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 422fb54..9444d5d 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -1022,7 +1022,15 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
return ret;
nilfs = inode->i_sb->s_fs_info;
- if (nilfs_test_opt(nilfs, BARRIER)) {
+ if (nilfs_test_opt(nilfs, BARRIER) &&
+ !nilfs->ns_flushed_device) {
+ nilfs->ns_flushed_device = 1;
+ /*
+ * the store to ns_flushed_device must not be reordered after
+ * blkdev_issue_flush
+ */
+ smp_wmb();
+
ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
if (ret == -EIO)
return ret;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index a1a1916..379da1b 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1997,6 +1997,10 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
err = nilfs_segctor_wait(sci);
if (err)
goto failed_to_write;
+
+ if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) ||
+ mode == SC_LSEG_DSYNC)
+ nilfs->ns_flushed_device = 0;
}
} while (sci->sc_stage.scnt != NILFS_ST_DONE);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 228f5bd..33aafbd 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -310,6 +310,9 @@ int nilfs_commit_super(struct super_block *sb, int flag)
nilfs->ns_sbsize));
}
clear_nilfs_sb_dirty(nilfs);
+ nilfs->ns_flushed_device = 1;
+ /* make sure store to ns_flushed_device cannot be reordered */
+ smp_wmb();
return nilfs_sync_super(sb, flag);
}
@@ -514,6 +517,20 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
}
up_write(&nilfs->ns_sem);
+ if (wait && !err && nilfs_test_opt(nilfs, BARRIER) &&
+ !nilfs->ns_flushed_device) {
+ nilfs->ns_flushed_device = 1;
+ /*
+ * the store to ns_flushed_device must not be reordered after
+ * blkdev_issue_flush
+ */
+ smp_wmb();
I am not at all sure if this memory barrier is enough. Memory barriers
only guarantee the order in which memory operations hit the CPU cache.
They do not guarantee that all CPUs see the previous memory operations.
They cannot be used to provide unconditional ordering. I am not even
sure if this can be done without proper locks, but I am not an expert in
lock-free algorithms.

Maybe the safest way would be to use nilfs->ns_sem around the whole if
statement.
Post by Andreas Rohner
+ err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+ if (err != -EIO)
+ err = 0;
+ }
+
return err;
}
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index d01ead1..dabb02c 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -45,6 +45,7 @@ enum {
/**
* struct the_nilfs - struct to supervise multiple nilfs mount points
@@ -103,6 +104,7 @@ enum {
*/
struct the_nilfs {
unsigned long ns_flags;
+ int ns_flushed_device;
struct block_device *ns_bdev;
struct rw_semaphore ns_sem;
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majordomo-***@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Ryusuke Konishi
2014-09-13 11:35:45 UTC
Permalink
Post by Andreas Rohner
Under normal circumstances nilfs_sync_fs() writes out the super block,
which causes a flush of the underlying block device. But this depends on
the THE_NILFS_SB_DIRTY flag, which is only set if the pointer to the
last segment crosses a segment boundary. So if only a small amount of
data is written before the call to nilfs_sync_fs(), no flush of the
block device occurs.
In the above case an additional call to blkdev_issue_flush() is needed.
To prevent unnecessary overhead, the new flag nilfs->ns_flushed_device
is introduced, which is cleared whenever new logs are written and set
whenever the block device is flushed.
---
fs/nilfs2/file.c | 10 +++++++++-
fs/nilfs2/ioctl.c | 10 +++++++++-
fs/nilfs2/segment.c | 4 ++++
fs/nilfs2/super.c | 17 +++++++++++++++++
fs/nilfs2/the_nilfs.h | 2 ++
5 files changed, 41 insertions(+), 2 deletions(-)
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 2497815..16375c2 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -56,7 +56,15 @@ int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
mutex_unlock(&inode->i_mutex);
nilfs = inode->i_sb->s_fs_info;
- if (!err && nilfs_test_opt(nilfs, BARRIER)) {
+ if (!err && nilfs_test_opt(nilfs, BARRIER) &&
+ !nilfs->ns_flushed_device) {
+ nilfs->ns_flushed_device = 1;
+ /*
+ * the store to ns_flushed_device must not be reordered after
+ * blkdev_issue_flush
+ */
+ smp_wmb();
+
err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
if (err != -EIO)
err = 0;
Looks good. However, these code lines and the comment are repeated in several places.
To simplify these, I propose to add the following inline function to
the_nilfs.h.

@@ -371,4 +373,24 @@ static inline int nilfs_segment_is_active(struct the_nilfs *nilfs, __u64 n)
return n == nilfs->ns_segnum || n == nilfs->ns_nextnum;
}

+static inline int nilfs_flush_device(struct the_nilfs *nilfs)
+{
+ int err;
+
+ if (!nilfs_test_opt(nilfs, BARRIER) || nilfs->ns_flushed_device)
+ return 0;
+
+ nilfs->ns_flushed_device = 1;
+ /*
+ * the store to ns_flushed_device must not be reordered after
+ * blkdev_issue_flush().
+ */
+ smp_wmb();
+
+ err = blkdev_issue_flush(nilfs->ns_bdev, GFP_KERNEL, NULL);
+ if (err != -EIO)
+ err = 0;
+ return err;
+}
+
#endif /* _THE_NILFS_H */
Post by Andreas Rohner
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 422fb54..9444d5d 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -1022,7 +1022,15 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
return ret;
nilfs = inode->i_sb->s_fs_info;
- if (nilfs_test_opt(nilfs, BARRIER)) {
+ if (nilfs_test_opt(nilfs, BARRIER) &&
+ !nilfs->ns_flushed_device) {
+ nilfs->ns_flushed_device = 1;
+ /*
+ * the store to ns_flushed_device must not be reordered after
+ * blkdev_issue_flush
+ */
+ smp_wmb();
+
ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
if (ret == -EIO)
return ret;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index a1a1916..379da1b 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1997,6 +1997,10 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
err = nilfs_segctor_wait(sci);
if (err)
goto failed_to_write;
+
+ if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) ||
+ mode == SC_LSEG_DSYNC)
+ nilfs->ns_flushed_device = 0;
}
} while (sci->sc_stage.scnt != NILFS_ST_DONE);
We can simplify this by inserting "nilfs->ns_flushed_device = 0" in
nilfs_segctor_complete_write() and nilfs_construct_dsync_segment()
separately as follows. Explicit memory barriers are not required in
the following changes because both nilfs_set_last_segment() and
nilfs_transaction_unlock() imply a memory barrier.

--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1833,6 +1833,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
nilfs_set_next_segment(nilfs, segbuf);

if (update_sr) {
+ nilfs->ns_flushed_device = 0;
nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start,
segbuf->sb_sum.seg_seq, nilfs->ns_cno++);

@@ -2216,6 +2217,8 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
sci->sc_dsync_end = end;

err = nilfs_segctor_do_construct(sci, SC_LSEG_DSYNC);
+ if (!err)
+ nilfs->ns_flushed_device = 0;

nilfs_transaction_unlock(sb);
return err;
Post by Andreas Rohner
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 228f5bd..33aafbd 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -310,6 +310,9 @@ int nilfs_commit_super(struct super_block *sb, int flag)
nilfs->ns_sbsize));
}
clear_nilfs_sb_dirty(nilfs);
+ nilfs->ns_flushed_device = 1;
+ /* make sure store to ns_flushed_device cannot be reordered */
+ smp_wmb();
return nilfs_sync_super(sb, flag);
}
@@ -514,6 +517,20 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
}
up_write(&nilfs->ns_sem);
+ if (wait && !err && nilfs_test_opt(nilfs, BARRIER) &&
+ !nilfs->ns_flushed_device) {
+ nilfs->ns_flushed_device = 1;
+ /*
+ * the store to ns_flushed_device must not be reordered after
+ * blkdev_issue_flush
+ */
+ smp_wmb();
+
+ err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+ if (err != -EIO)
+ err = 0;
+ }
+
return err;
}
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index d01ead1..dabb02c 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -45,6 +45,7 @@ enum {
/**
* struct the_nilfs - struct to supervise multiple nilfs mount points
ns_flushed_device is inserted after ns_flags, so these two comment lines
should be swapped.
Post by Andreas Rohner
@@ -103,6 +104,7 @@ enum {
*/
struct the_nilfs {
unsigned long ns_flags;
+ int ns_flushed_device;
struct block_device *ns_bdev;
struct rw_semaphore ns_sem;
--
2.1.0
Regards,
Ryusuke Konishi
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majordomo-***@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Loading...