Discussion:
[PATCH 0/2] nilfs2: add support for FITRIM ioctl
Andreas Rohner
2014-02-15 13:34:24 UTC
Permalink
Hi,

This patch adds support for the FITRIM ioctl, which allows user space
tools like fstrim to issue TRIM/DISCARD requests to the underlying
device. It takes a fstrim_range structure as a parameter and for every
clean segment in the specified range the function blkdev_issue_discard
is called.

Best regards,
Andreas Rohner

---
Andreas Rohner (2):
nilfs2: add nilfs_sufile_trim_fs to trim clean segs
nilfs2: add FITRIM ioctl support for nilfs2

fs/nilfs2/ioctl.c | 46 +++++++++++++++++++++
fs/nilfs2/sufile.c | 114 +++++++++++++++++++++++++++++++++++++++++++++++++++++
fs/nilfs2/sufile.h | 1 +
3 files changed, 161 insertions(+)
--
1.8.5.4

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majordomo-***@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Andreas Rohner
2014-02-15 13:34:25 UTC
Permalink
This patch adds the nilfs_sufile_trim_fs function, which takes a
fstrim_range structure and calls blkdev_issue_discard for every
clean segment in the specified range.

Signed-off-by: Andreas Rohner <andreas.rohner-***@public.gmane.org>
---
fs/nilfs2/sufile.c | 114 +++++++++++++++++++++++++++++++++++++++++++++++++++++
fs/nilfs2/sufile.h | 1 +
2 files changed, 115 insertions(+)

diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 3127e9f..022b17f 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -870,6 +870,120 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
}

/**
+ * nilfs_sufile_trim_fs() - trim ioctl handle function
+ * @sufile: inode of segment usage file
+ * @range: fstrim_range structure
+ *
+ * start: First Byte to trim
+ * len: number of Bytes to trim from start
+ * minlen: minimum extent length in Bytes
+ *
+ * Decription: nilfs_sufile_trim_fs goes through all segments containing bytes
+ * from start to start+len. For each clean segment blkdev_issue_discard
+ * function is invoked to trim it.
+ *
+ * Return Value: On success, 0 is returned or negative error code, otherwise.
+ */
+int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
+{
+ struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
+ struct buffer_head *su_bh;
+ struct nilfs_segment_usage *su;
+ void *kaddr;
+ size_t susz = NILFS_MDT(sufile)->mi_entry_size;
+ ssize_t n;
+ sector_t seg_start, seg_end, start = 0, nblocks = 0;
+ u64 segnum, end, minlen, trimmed = 0;
+ int i, ret = 0;
+ unsigned long segusages_per_block;
+ unsigned int sects_per_block;
+
+ segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile);
+ sects_per_block = (1 << nilfs->ns_blocksize_bits) /
+ bdev_logical_block_size(nilfs->ns_bdev);
+ segnum = nilfs_get_segnum_of_block(nilfs, range->start >>
+ nilfs->ns_blocksize_bits);
+ end = nilfs_get_segnum_of_block(nilfs, (range->start + range->len)
+ >> nilfs->ns_blocksize_bits);
+ minlen = range->minlen >> nilfs->ns_blocksize_bits;
+
+ if (minlen > nilfs->ns_blocks_per_segment ||
+ segnum >= nilfs->ns_nsegments ||
+ range->len < nilfs->ns_blocksize)
+ return -EINVAL;
+ if (end > nilfs->ns_nsegments)
+ end = nilfs->ns_nsegments;
+ if (end == segnum)
+ goto out;
+
+ down_read(&NILFS_MDT(sufile)->mi_sem);
+
+ while (segnum < end) {
+ n = min_t(unsigned long,
+ segusages_per_block -
+ nilfs_sufile_get_offset(sufile, segnum),
+ end - segnum);
+ ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0,
+ &su_bh);
+ if (ret < 0) {
+ if (ret != -ENOENT)
+ goto out_sem;
+ /* hole */
+ segnum += n;
+ continue;
+ }
+
+ kaddr = kmap(su_bh->b_page);
+ su = nilfs_sufile_block_get_segment_usage(
+ sufile, segnum, su_bh, kaddr);
+ for (i = 0; i < n; ++i, ++segnum, su = (void *)su + susz) {
+ if (!nilfs_segment_usage_clean(su))
+ continue;
+
+ nilfs_get_segment_range(nilfs, segnum, &seg_start,
+ &seg_end);
+
+ if (!nblocks) {
+ start = seg_start;
+ nblocks = seg_end - seg_start + 1;
+ } else if (start + nblocks == seg_start) {
+ nblocks += seg_end - seg_start + 1;
+ } else {
+ ret = blkdev_issue_discard(nilfs->ns_bdev,
+ start * sects_per_block,
+ nblocks * sects_per_block,
+ GFP_NOFS, 0);
+ if (ret < 0) {
+ kunmap(kaddr);
+ put_bh(su_bh);
+ goto out_sem;
+ }
+
+ trimmed += nblocks;
+ nblocks = 0;
+ }
+ }
+ kunmap(kaddr);
+ put_bh(su_bh);
+ }
+
+ if (nblocks) {
+ ret = blkdev_issue_discard(nilfs->ns_bdev,
+ start * sects_per_block,
+ nblocks * sects_per_block,
+ GFP_NOFS, 0);
+ if (!ret)
+ trimmed += nblocks;
+ }
+
+out_sem:
+ up_read(&NILFS_MDT(sufile)->mi_sem);
+out:
+ range->len = trimmed << nilfs->ns_blocksize_bits;
+ return ret;
+}
+
+/**
* nilfs_sufile_read - read or get sufile inode
* @sb: super block instance
* @susize: size of a segment usage entry
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index e84bc5b..2434abd 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -65,6 +65,7 @@ void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs);
int nilfs_sufile_read(struct super_block *sb, size_t susize,
struct nilfs_inode *raw_inode, struct inode **inodep);
+int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range);

/**
* nilfs_sufile_scrap - make a segment garbage
--
1.8.5.4

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majordomo-***@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Ryusuke Konishi
2014-02-17 03:00:00 UTC
Permalink
Hi Andreas,
Post by Andreas Rohner
This patch adds the nilfs_sufile_trim_fs function, which takes a
fstrim_range structure and calls blkdev_issue_discard for every
clean segment in the specified range.
Thank you for doing this work.
I'll add comments inline below.
Post by Andreas Rohner
---
fs/nilfs2/sufile.c | 114 +++++++++++++++++++++++++++++++++++++++++++++++++++++
fs/nilfs2/sufile.h | 1 +
2 files changed, 115 insertions(+)
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 3127e9f..022b17f 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -870,6 +870,120 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
}
/**
+ * nilfs_sufile_trim_fs() - trim ioctl handle function
+ *
+ * start: First Byte to trim
+ * len: number of Bytes to trim from start
+ * minlen: minimum extent length in Bytes
+ *
+ * Decription: nilfs_sufile_trim_fs goes through all segments containing bytes
+ * from start to start+len. For each clean segment blkdev_issue_discard
+ * function is invoked to trim it.
+ *
+ * Return Value: On success, 0 is returned or negative error code, otherwise.
+ */
+int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
+{
+ struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
+ struct buffer_head *su_bh;
+ struct nilfs_segment_usage *su;
+ void *kaddr;
+ size_t susz = NILFS_MDT(sufile)->mi_entry_size;
+ ssize_t n;
+ sector_t seg_start, seg_end, start = 0, nblocks = 0;
+ u64 segnum, end, minlen, trimmed = 0;
+ int i, ret = 0;
+ unsigned long segusages_per_block;
+ unsigned int sects_per_block;
+
+ segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile);
+ sects_per_block = (1 << nilfs->ns_blocksize_bits) /
+ bdev_logical_block_size(nilfs->ns_bdev);
+ segnum = nilfs_get_segnum_of_block(nilfs, range->start >>
+ nilfs->ns_blocksize_bits);
+ end = nilfs_get_segnum_of_block(nilfs, (range->start + range->len)
+ >> nilfs->ns_blocksize_bits);
+ minlen = range->minlen >> nilfs->ns_blocksize_bits;
+
+ if (minlen > nilfs->ns_blocks_per_segment ||
This check looks inappropriate. If we set a lower limit for
range->minlin, I think it should be the file system size (block device
size). For your information, the meaning of minlen is described as
follows in the man page of fstrim command:

Minimum contiguous free range to discard, in bytes. (This
value is internally rounded up to a multiple of the filesystem
block size). Free ranges smaller than this will be ignored.
By increasing this value, the fstrim operation will complete
more quickly for filesystems with badly fragmented freespace,
although not all blocks will be discarded. Default value is
zero, discard every free block.

So, too small lower limit interferes purpose of the minlen argument.
Post by Andreas Rohner
+ segnum >= nilfs->ns_nsegments ||
This is bad too, because userland programs usually don't know the
segment structure of NILFS. When we specify the partition size to
range->len, FITRIM can fail due to this check.

The upper limit of (range->start + range->len) should be
the block device size.
Post by Andreas Rohner
+ range->len < nilfs->ns_blocksize)
This looks OK.
Post by Andreas Rohner
+ return -EINVAL;
I think these EINVAL checks should be move to ioctl interface side
because they should be performed in a perspective of general file
system and shouldn't depend on the internal structure of NILFS so
much. If we need to adjust the range in this function, we should do
it silently.
Post by Andreas Rohner
+ if (end > nilfs->ns_nsegments)
+ end = nilfs->ns_nsegments;
Yes, this adjustment is what we should do here, but 'end' segnum was
rounded down to segment alighment before. So, it should be:

if (end >= nilfs->ns_nsegments)
end = nilfs->ns_nsegments - 1;
Post by Andreas Rohner
+ if (end == segnum)
+ goto out;
and

if (end < segnum)
goto out;
Post by Andreas Rohner
+
+ down_read(&NILFS_MDT(sufile)->mi_sem);
+
+ while (segnum < end) {
and

while (segnum <= end) {
Post by Andreas Rohner
+ n = min_t(unsigned long,
+ segusages_per_block -
+ nilfs_sufile_get_offset(sufile, segnum),
+ end - segnum);
Then, we can reuse nilfs_sufile_segment_usages_in_block() to calculate
'n'.
Post by Andreas Rohner
+ ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0,
+ &su_bh);
+ if (ret < 0) {
+ if (ret != -ENOENT)
+ goto out_sem;
+ /* hole */
+ segnum += n;
+ continue;
Skipping segments within segment usage blocks which are hole, are OK
since they are never written after the previous mkfs and mkfs.nilfs2
discards segments.

However, note that this also separates the extent defined with (start,
nblocks). So, (start, nblocks) should be updated and
blkdev_issue_discard() should be called as needed.
Post by Andreas Rohner
+ }
+
+ kaddr = kmap(su_bh->b_page);
Avoid using kmap()/kunmap() (if possible), these are expensive for
architectures nowadays which shares virtual address space among
processors. I think we can replace them with kmap_atomic() and
kunmap_atomic() for this function by carefully releasing the highmem
before calling blocking operations such as blkdev_issue_discard() and
nilfs_sufile_get_segment_usage_block().
Post by Andreas Rohner
+ su = nilfs_sufile_block_get_segment_usage(
+ sufile, segnum, su_bh, kaddr);
+ for (i = 0; i < n; ++i, ++segnum, su = (void *)su + susz) {
+ if (!nilfs_segment_usage_clean(su))
+ continue;
+
+ nilfs_get_segment_range(nilfs, segnum, &seg_start,
+ &seg_end);
+
+ if (!nblocks) {
+ start = seg_start;
+ nblocks = seg_end - seg_start + 1;
+ } else if (start + nblocks == seg_start) {
+ nblocks += seg_end - seg_start + 1;
+ } else {
We should ignore the extent if the size is smaller than range->minlen.
Post by Andreas Rohner
+ ret = blkdev_issue_discard(nilfs->ns_bdev,
+ start * sects_per_block,
+ nblocks * sects_per_block,
+ GFP_NOFS, 0);
+ if (ret < 0) {
+ kunmap(kaddr);
+ put_bh(su_bh);
+ goto out_sem;
+ }
+
+ trimmed += nblocks;
+ nblocks = 0;
+ }
+ }
+ kunmap(kaddr);
+ put_bh(su_bh);
+ }
+
+ if (nblocks) {
+ ret = blkdev_issue_discard(nilfs->ns_bdev,
+ start * sects_per_block,
+ nblocks * sects_per_block,
+ GFP_NOFS, 0);
+ if (!ret)
+ trimmed += nblocks;
+ }
+
+ up_read(&NILFS_MDT(sufile)->mi_sem);
+ range->len = trimmed << nilfs->ns_blocksize_bits;
+ return ret;
+}
+
+/**
* nilfs_sufile_read - read or get sufile inode
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index e84bc5b..2434abd 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -65,6 +65,7 @@ void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs);
int nilfs_sufile_read(struct super_block *sb, size_t susize,
struct nilfs_inode *raw_inode, struct inode **inodep);
+int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range);
/**
* nilfs_sufile_scrap - make a segment garbage
--
1.8.5.4
Regards,
Ryusuke Konishi
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majordomo-***@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Ryusuke Konishi
2014-02-17 04:27:33 UTC
Permalink
Post by Ryusuke Konishi
Hi Andreas,
Post by Andreas Rohner
+ segnum >= nilfs->ns_nsegments ||
This is bad too, because userland programs usually don't know the
segment structure of NILFS. When we specify the partition size to
range->len, FITRIM can fail due to this check.
The upper limit of (range->start + range->len) should be
the block device size.
I mistook for this. segnum was the start segment number, so what
we should do was:

- Return -EINVAL in ioctl side if range->start is larger than the
block device size.
- Escape this function without an error setting range->len to zero if
segnum >= nilfs->ns_nsegments.

And, range->len should not be limited by the device size because it is
set to ULLONG_MAX by default. We have to accept larger range->len
values.

Regards,
Ryusuke Konishi
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majordomo-***@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Andreas Rohner
2014-02-17 09:06:56 UTC
Permalink
Hi Ryusuke,
Post by Ryusuke Konishi
Post by Andreas Rohner
+int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
+{
+ struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
+ struct buffer_head *su_bh;
+ struct nilfs_segment_usage *su;
+ void *kaddr;
+ size_t susz = NILFS_MDT(sufile)->mi_entry_size;
+ ssize_t n;
+ sector_t seg_start, seg_end, start = 0, nblocks = 0;
+ u64 segnum, end, minlen, trimmed = 0;
+ int i, ret = 0;
+ unsigned long segusages_per_block;
+ unsigned int sects_per_block;
+
+ segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile);
+ sects_per_block = (1 << nilfs->ns_blocksize_bits) /
+ bdev_logical_block_size(nilfs->ns_bdev);
+ segnum = nilfs_get_segnum_of_block(nilfs, range->start >>
+ nilfs->ns_blocksize_bits);
+ end = nilfs_get_segnum_of_block(nilfs, (range->start + range->len)
+ >> nilfs->ns_blocksize_bits);
+ minlen = range->minlen >> nilfs->ns_blocksize_bits;
+
+ if (minlen > nilfs->ns_blocks_per_segment ||
This check looks inappropriate. If we set a lower limit for
range->minlin, I think it should be the file system size (block device
size). For your information, the meaning of minlen is described as
Minimum contiguous free range to discard, in bytes. (This
value is internally rounded up to a multiple of the filesystem
block size). Free ranges smaller than this will be ignored.
By increasing this value, the fstrim operation will complete
more quickly for filesystems with badly fragmented freespace,
although not all blocks will be discarded. Default value is
zero, discard every free block.
So, too small lower limit interferes purpose of the minlen argument.
Agreed.
Post by Ryusuke Konishi
Post by Andreas Rohner
+ segnum >= nilfs->ns_nsegments ||
This is bad too, because userland programs usually don't know the
segment structure of NILFS. When we specify the partition size to
range->len, FITRIM can fail due to this check.
The upper limit of (range->start + range->len) should be
the block device size.
ext4 also checks the range structure like that. Besides couldn't it be
possible, that the block device is bigger than the file system?
Post by Ryusuke Konishi
Post by Andreas Rohner
+ range->len < nilfs->ns_blocksize)
This looks OK.
Post by Andreas Rohner
+ return -EINVAL;
I think these EINVAL checks should be move to ioctl interface side
because they should be performed in a perspective of general file
system and shouldn't depend on the internal structure of NILFS so
much. If we need to adjust the range in this function, we should do
it silently.
I just did it the same way as it is done in ext4.
Post by Ryusuke Konishi
Post by Andreas Rohner
+ if (end > nilfs->ns_nsegments)
+ end = nilfs->ns_nsegments;
Yes, this adjustment is what we should do here, but 'end' segnum was
if (end >= nilfs->ns_nsegments)
end = nilfs->ns_nsegments - 1;
Post by Andreas Rohner
+ if (end == segnum)
+ goto out;
and
if (end < segnum)
goto out;
Post by Andreas Rohner
+
+ down_read(&NILFS_MDT(sufile)->mi_sem);
+
+ while (segnum < end) {
and
while (segnum <= end) {
Post by Andreas Rohner
+ n = min_t(unsigned long,
+ segusages_per_block -
+ nilfs_sufile_get_offset(sufile, segnum),
+ end - segnum);
Then, we can reuse nilfs_sufile_segment_usages_in_block() to calculate
'n'.
Yes we can do this, if you want to use
nilfs_sufile_segment_usages_in_block(). I would just like to state, that
my code is also correct here.
Post by Ryusuke Konishi
Post by Andreas Rohner
+ ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0,
+ &su_bh);
+ if (ret < 0) {
+ if (ret != -ENOENT)
+ goto out_sem;
+ /* hole */
+ segnum += n;
+ continue;
Skipping segments within segment usage blocks which are hole, are OK
since they are never written after the previous mkfs and mkfs.nilfs2
discards segments.
However, note that this also separates the extent defined with (start,
nblocks). So, (start, nblocks) should be updated and
blkdev_issue_discard() should be called as needed.
As far as I can tell this is not necessary here. But I checked the
function again and I found a small bug further below.
Post by Ryusuke Konishi
Post by Andreas Rohner
+ }
+
+ kaddr = kmap(su_bh->b_page);
Avoid using kmap()/kunmap() (if possible), these are expensive for
architectures nowadays which shares virtual address space among
processors. I think we can replace them with kmap_atomic() and
kunmap_atomic() for this function by carefully releasing the highmem
before calling blocking operations such as blkdev_issue_discard() and
nilfs_sufile_get_segment_usage_block().
Ok. Yes that should be possible.
Post by Ryusuke Konishi
Post by Andreas Rohner
+ su = nilfs_sufile_block_get_segment_usage(
+ sufile, segnum, su_bh, kaddr);
+ for (i = 0; i < n; ++i, ++segnum, su = (void *)su + susz) {
+ if (!nilfs_segment_usage_clean(su))
+ continue;
+
+ nilfs_get_segment_range(nilfs, segnum, &seg_start,
+ &seg_end);
+
+ if (!nblocks) {
+ start = seg_start;
+ nblocks = seg_end - seg_start + 1;
+ } else if (start + nblocks == seg_start) {
+ nblocks += seg_end - seg_start + 1;
+ } else {
We should ignore the extent if the size is smaller than range->minlen.
Agreed.
Post by Ryusuke Konishi
Post by Andreas Rohner
+ ret = blkdev_issue_discard(nilfs->ns_bdev,
+ start * sects_per_block,
+ nblocks * sects_per_block,
+ GFP_NOFS, 0);
+ if (ret < 0) {
+ kunmap(kaddr);
+ put_bh(su_bh);
+ goto out_sem;
+ }
+
+ trimmed += nblocks;
+ nblocks = 0;
This should be:

+ start = seg_start;
+ nblocks = seg_end - seg_start + 1;


Regards,
Andreas Rohner

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majordomo-***@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Ryusuke Konishi
2014-02-17 10:42:00 UTC
Permalink
Post by Andreas Rohner
Hi Ryusuke,
Post by Ryusuke Konishi
Post by Andreas Rohner
+ segnum >= nilfs->ns_nsegments ||
This is bad too, because userland programs usually don't know the
segment structure of NILFS. When we specify the partition size to
range->len, FITRIM can fail due to this check.
The upper limit of (range->start + range->len) should be
the block device size.
ext4 also checks the range structure like that. Besides couldn't it be
possible, that the block device is bigger than the file system?
As I mentioned in my second mail, I misunderstood the meaning of this
check at first.

As you pointed out, the block device size may be bigger than the file
system size after shrinking the file system. So, sbp->s_dev_size
should be used for for this check. But it is a bit complicated since
the current nilfs object doesn't have on-memory copy of it.

It is OK to use nilfs->ns_nsegments as its alternative.
Post by Andreas Rohner
Post by Ryusuke Konishi
Post by Andreas Rohner
+ ret = blkdev_issue_discard(nilfs->ns_bdev,
+ start * sects_per_block,
+ nblocks * sects_per_block,
+ GFP_NOFS, 0);
+ if (ret < 0) {
+ kunmap(kaddr);
+ put_bh(su_bh);
+ goto out_sem;
+ }
+
+ trimmed += nblocks;
+ nblocks = 0;
+ start = seg_start;
+ nblocks = seg_end - seg_start + 1;
Yes, that's right.

Regards,
Ryusuke Konishi
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majordomo-***@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Andreas Rohner
2014-02-17 11:04:52 UTC
Permalink
Post by Ryusuke Konishi
Post by Andreas Rohner
Hi Ryusuke,
Post by Ryusuke Konishi
Post by Andreas Rohner
+ segnum >= nilfs->ns_nsegments ||
This is bad too, because userland programs usually don't know the
segment structure of NILFS. When we specify the partition size to
range->len, FITRIM can fail due to this check.
The upper limit of (range->start + range->len) should be
the block device size.
ext4 also checks the range structure like that. Besides couldn't it be
possible, that the block device is bigger than the file system?
As I mentioned in my second mail, I misunderstood the meaning of this
check at first.
As you pointed out, the block device size may be bigger than the file
system size after shrinking the file system. So, sbp->s_dev_size
should be used for for this check. But it is a bit complicated since
the current nilfs object doesn't have on-memory copy of it.
It is OK to use nilfs->ns_nsegments as its alternative.
Ok, just to be clear. According to your second mail it should look
something like this in nilfs_sufile_trim_fs():

if (segnum >= nilfs->ns_nsegments)
goto out;

And in nilfs_ioctl_trim_fs():

if (range.len < nilfs->ns_blocksize ||
range.start >= (nilfs->ns_nsegments *
nilfs->ns_blocks_per_segment) <<
nilfs->ns_blocksize_bits)
return -EINVAL;


Is that correct?

Regards,
Andreas Rohner
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majordomo-***@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Ryusuke Konishi
2014-02-17 11:39:01 UTC
Permalink
Post by Andreas Rohner
Post by Ryusuke Konishi
Post by Andreas Rohner
Hi Ryusuke,
Post by Ryusuke Konishi
Post by Andreas Rohner
+ segnum >= nilfs->ns_nsegments ||
This is bad too, because userland programs usually don't know the
segment structure of NILFS. When we specify the partition size to
range->len, FITRIM can fail due to this check.
The upper limit of (range->start + range->len) should be
the block device size.
ext4 also checks the range structure like that. Besides couldn't it be
possible, that the block device is bigger than the file system?
As I mentioned in my second mail, I misunderstood the meaning of this
check at first.
As you pointed out, the block device size may be bigger than the file
system size after shrinking the file system. So, sbp->s_dev_size
should be used for for this check. But it is a bit complicated since
the current nilfs object doesn't have on-memory copy of it.
It is OK to use nilfs->ns_nsegments as its alternative.
Ok, just to be clear. According to your second mail it should look
if (segnum >= nilfs->ns_nsegments)
goto out;
if (range.len < nilfs->ns_blocksize ||
range.start >= (nilfs->ns_nsegments *
nilfs->ns_blocks_per_segment) <<
nilfs->ns_blocksize_bits)
return -EINVAL;
Is that correct?
Yes, exactly.

Regards,
Ryusuke Konishi
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majordomo-***@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Andreas Rohner
2014-02-17 09:17:57 UTC
Permalink
Post by Ryusuke Konishi
Post by Andreas Rohner
+ if (end > nilfs->ns_nsegments)
+ end = nilfs->ns_nsegments;
Yes, this adjustment is what we should do here, but 'end' segnum was
if (end >= nilfs->ns_nsegments)
end = nilfs->ns_nsegments - 1;
Post by Andreas Rohner
+ if (end == segnum)
+ goto out;
and
if (end < segnum)
goto out;
Post by Andreas Rohner
+
+ down_read(&NILFS_MDT(sufile)->mi_sem);
+
+ while (segnum < end) {
and
while (segnum <= end) {
Post by Andreas Rohner
+ n = min_t(unsigned long,
+ segusages_per_block -
+ nilfs_sufile_get_offset(sufile, segnum),
+ end - segnum);
Then, we can reuse nilfs_sufile_segment_usages_in_block() to calculate
'n'.
Actually I don't think that is correct. What if range->start = 0 and
range->end = 8MB. Then segnum = 0 and end = 1. Your code would discard
segment 0 and segment 1, whereas my version would discard only segment
0, which seems to be more reasonable.

br,
Andreas Rohner
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majordomo-***@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Ryusuke Konishi
2014-02-17 10:06:10 UTC
Permalink
Post by Andreas Rohner
Post by Ryusuke Konishi
Post by Andreas Rohner
+ if (end > nilfs->ns_nsegments)
+ end = nilfs->ns_nsegments;
Yes, this adjustment is what we should do here, but 'end' segnum was
if (end >= nilfs->ns_nsegments)
end = nilfs->ns_nsegments - 1;
Post by Andreas Rohner
+ if (end == segnum)
+ goto out;
and
if (end < segnum)
goto out;
Post by Andreas Rohner
+
+ down_read(&NILFS_MDT(sufile)->mi_sem);
+
+ while (segnum < end) {
and
while (segnum <= end) {
Post by Andreas Rohner
+ n = min_t(unsigned long,
+ segusages_per_block -
+ nilfs_sufile_get_offset(sufile, segnum),
+ end - segnum);
Then, we can reuse nilfs_sufile_segment_usages_in_block() to calculate
'n'.
Actually I don't think that is correct. What if range->start = 0 and
range->end = 8MB. Then segnum = 0 and end = 1. Your code would discard
segment 0 and segment 1, whereas my version would discard only segment
0, which seems to be more reasonable.
The problem seems that 'end' is not calculated properly.
I think it should be

end = nilfs_get_segnum_of_block(
nilfs,
(range->start + range->len + <segment-size-in-bytes> - 1)
Post by Andreas Rohner
Post by Ryusuke Konishi
nilfs->ns_blocksize_bits) - 1;
or can be simplified to

end = nilfs_get_segnum_of_block(
nilfs,
(range->start + range->len - 1) >> nilfs->ns_blocksize_bits);

if range->len > 0 is guaranteed.


The calculation of segnum extends the range to be discarded since it
is rounded down to segment alignment, but that of the current
implementation for 'end' truncates the range. Both should be
extended.


Regards,
Ryusuke Konishi
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majordomo-***@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Andreas Rohner
2014-02-17 10:34:16 UTC
Permalink
Post by Ryusuke Konishi
Post by Andreas Rohner
Post by Ryusuke Konishi
Post by Andreas Rohner
+ if (end > nilfs->ns_nsegments)
+ end = nilfs->ns_nsegments;
Yes, this adjustment is what we should do here, but 'end' segnum was
if (end >= nilfs->ns_nsegments)
end = nilfs->ns_nsegments - 1;
Post by Andreas Rohner
+ if (end == segnum)
+ goto out;
and
if (end < segnum)
goto out;
Post by Andreas Rohner
+
+ down_read(&NILFS_MDT(sufile)->mi_sem);
+
+ while (segnum < end) {
and
while (segnum <= end) {
Post by Andreas Rohner
+ n = min_t(unsigned long,
+ segusages_per_block -
+ nilfs_sufile_get_offset(sufile, segnum),
+ end - segnum);
Then, we can reuse nilfs_sufile_segment_usages_in_block() to calculate
'n'.
Actually I don't think that is correct. What if range->start = 0 and
range->end = 8MB. Then segnum = 0 and end = 1. Your code would discard
segment 0 and segment 1, whereas my version would discard only segment
0, which seems to be more reasonable.
The problem seems that 'end' is not calculated properly.
I think it should be
end = nilfs_get_segnum_of_block(
nilfs,
(range->start + range->len + <segment-size-in-bytes> - 1)
Post by Andreas Rohner
Post by Ryusuke Konishi
nilfs->ns_blocksize_bits) - 1;
or can be simplified to
end = nilfs_get_segnum_of_block(
nilfs,
(range->start + range->len - 1) >> nilfs->ns_blocksize_bits);
if range->len > 0 is guaranteed.
The calculation of segnum extends the range to be discarded since it
is rounded down to segment alignment, but that of the current
implementation for 'end' truncates the range. Both should be
extended.
Then shouldn't both be truncated? The user expects that less bytes than
range->len are discarded but not more. Maybe I am wrong and it is
defined somewhere...

Regards,
Andreas Rohner
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majordomo-***@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Ryusuke Konishi
2014-02-17 11:21:43 UTC
Permalink
Post by Andreas Rohner
Post by Ryusuke Konishi
Post by Andreas Rohner
Post by Ryusuke Konishi
Post by Andreas Rohner
+ if (end > nilfs->ns_nsegments)
+ end = nilfs->ns_nsegments;
Yes, this adjustment is what we should do here, but 'end' segnum was
if (end >= nilfs->ns_nsegments)
end = nilfs->ns_nsegments - 1;
Post by Andreas Rohner
+ if (end == segnum)
+ goto out;
and
if (end < segnum)
goto out;
Post by Andreas Rohner
+
+ down_read(&NILFS_MDT(sufile)->mi_sem);
+
+ while (segnum < end) {
and
while (segnum <= end) {
Post by Andreas Rohner
+ n = min_t(unsigned long,
+ segusages_per_block -
+ nilfs_sufile_get_offset(sufile, segnum),
+ end - segnum);
Then, we can reuse nilfs_sufile_segment_usages_in_block() to calculate
'n'.
Actually I don't think that is correct. What if range->start = 0 and
range->end = 8MB. Then segnum = 0 and end = 1. Your code would discard
segment 0 and segment 1, whereas my version would discard only segment
0, which seems to be more reasonable.
The problem seems that 'end' is not calculated properly.
I think it should be
end = nilfs_get_segnum_of_block(
nilfs,
(range->start + range->len + <segment-size-in-bytes> - 1)
Post by Andreas Rohner
Post by Ryusuke Konishi
nilfs->ns_blocksize_bits) - 1;
or can be simplified to
end = nilfs_get_segnum_of_block(
nilfs,
(range->start + range->len - 1) >> nilfs->ns_blocksize_bits);
if range->len > 0 is guaranteed.
The calculation of segnum extends the range to be discarded since it
is rounded down to segment alignment, but that of the current
implementation for 'end' truncates the range. Both should be
extended.
Then shouldn't both be truncated? The user expects that less bytes than
range->len are discarded but not more. Maybe I am wrong and it is
defined somewhere...
Uum, xfs looks to be truncating both. This seems to be authentic
when we think the original meaning of the fitrim range.

I first thought truncating the range can generate gaps between
consecutive calls of FITRIM (if we use FITRIM like that). But now I'm
tempted to take the xfs approach. Let's take this semantics.

So, we now have to round up range->start to calculate (start) segnum.

Regards,
Ryusuke Konishi
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majordomo-***@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Andreas Rohner
2014-02-17 11:53:54 UTC
Permalink
Post by Ryusuke Konishi
Post by Andreas Rohner
Post by Ryusuke Konishi
Post by Andreas Rohner
Post by Ryusuke Konishi
Post by Andreas Rohner
+ if (end > nilfs->ns_nsegments)
+ end = nilfs->ns_nsegments;
Yes, this adjustment is what we should do here, but 'end' segnum was
if (end >= nilfs->ns_nsegments)
end = nilfs->ns_nsegments - 1;
Post by Andreas Rohner
+ if (end == segnum)
+ goto out;
and
if (end < segnum)
goto out;
Post by Andreas Rohner
+
+ down_read(&NILFS_MDT(sufile)->mi_sem);
+
+ while (segnum < end) {
and
while (segnum <= end) {
Post by Andreas Rohner
+ n = min_t(unsigned long,
+ segusages_per_block -
+ nilfs_sufile_get_offset(sufile, segnum),
+ end - segnum);
Then, we can reuse nilfs_sufile_segment_usages_in_block() to calculate
'n'.
Actually I don't think that is correct. What if range->start = 0 and
range->end = 8MB. Then segnum = 0 and end = 1. Your code would discard
segment 0 and segment 1, whereas my version would discard only segment
0, which seems to be more reasonable.
The problem seems that 'end' is not calculated properly.
I think it should be
end = nilfs_get_segnum_of_block(
nilfs,
(range->start + range->len + <segment-size-in-bytes> - 1)
Post by Andreas Rohner
Post by Ryusuke Konishi
nilfs->ns_blocksize_bits) - 1;
or can be simplified to
end = nilfs_get_segnum_of_block(
nilfs,
(range->start + range->len - 1) >> nilfs->ns_blocksize_bits);
if range->len > 0 is guaranteed.
The calculation of segnum extends the range to be discarded since it
is rounded down to segment alignment, but that of the current
implementation for 'end' truncates the range. Both should be
extended.
Then shouldn't both be truncated? The user expects that less bytes than
range->len are discarded but not more. Maybe I am wrong and it is
defined somewhere...
Uum, xfs looks to be truncating both. This seems to be authentic
when we think the original meaning of the fitrim range.
I first thought truncating the range can generate gaps between
consecutive calls of FITRIM (if we use FITRIM like that).
Hmm good point I didn't think of that. Then by extending both sides the
overlapping segments would get discarded twice with consecutive calls,
which shouldn't be a problem.
Post by Ryusuke Konishi
But now I'm
tempted to take the xfs approach. Let's take this semantics.
So, we now have to round up range->start to calculate (start) segnum.
Ok. I think in the end it doesn't really matter, because most users will
use the default values:

range->start = 0
range->minlen = 0
range->len = ULLONG_MAX

Regards,
Andreas Rohner
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majordomo-***@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Ryusuke Konishi
2014-02-17 13:28:11 UTC
Permalink
Post by Andreas Rohner
Post by Ryusuke Konishi
Post by Andreas Rohner
Post by Ryusuke Konishi
Post by Andreas Rohner
Post by Ryusuke Konishi
Post by Andreas Rohner
+ if (end > nilfs->ns_nsegments)
+ end = nilfs->ns_nsegments;
Yes, this adjustment is what we should do here, but 'end' segnum was
if (end >= nilfs->ns_nsegments)
end = nilfs->ns_nsegments - 1;
Post by Andreas Rohner
+ if (end == segnum)
+ goto out;
and
if (end < segnum)
goto out;
Post by Andreas Rohner
+
+ down_read(&NILFS_MDT(sufile)->mi_sem);
+
+ while (segnum < end) {
and
while (segnum <= end) {
Post by Andreas Rohner
+ n = min_t(unsigned long,
+ segusages_per_block -
+ nilfs_sufile_get_offset(sufile, segnum),
+ end - segnum);
Then, we can reuse nilfs_sufile_segment_usages_in_block() to calculate
'n'.
Actually I don't think that is correct. What if range->start = 0 and
range->end = 8MB. Then segnum = 0 and end = 1. Your code would discard
segment 0 and segment 1, whereas my version would discard only segment
0, which seems to be more reasonable.
The problem seems that 'end' is not calculated properly.
I think it should be
end = nilfs_get_segnum_of_block(
nilfs,
(range->start + range->len + <segment-size-in-bytes> - 1)
Post by Andreas Rohner
Post by Ryusuke Konishi
nilfs->ns_blocksize_bits) - 1;
or can be simplified to
end = nilfs_get_segnum_of_block(
nilfs,
(range->start + range->len - 1) >> nilfs->ns_blocksize_bits);
if range->len > 0 is guaranteed.
The calculation of segnum extends the range to be discarded since it
is rounded down to segment alignment, but that of the current
implementation for 'end' truncates the range. Both should be
extended.
Then shouldn't both be truncated? The user expects that less bytes than
range->len are discarded but not more. Maybe I am wrong and it is
defined somewhere...
Uum, xfs looks to be truncating both. This seems to be authentic
when we think the original meaning of the fitrim range.
I first thought truncating the range can generate gaps between
consecutive calls of FITRIM (if we use FITRIM like that).
Hmm good point I didn't think of that. Then by extending both sides the
overlapping segments would get discarded twice with consecutive calls,
which shouldn't be a problem.
Post by Ryusuke Konishi
But now I'm
tempted to take the xfs approach. Let's take this semantics.
So, we now have to round up range->start to calculate (start) segnum.
Ok. I think in the end it doesn't really matter, because most users will
range->start = 0
range->minlen = 0
range->len = ULLONG_MAX
Or, we have the third option to avoid this issue. It's simply
truncating both to sector boundary. In this case, the gap problem
will not arise since userland programs usually separate the range
considering sector boundary or file system block size boundary. I
think this is more preferable than truncating it to segment size.

How do you think of this ?

Regards,
Ryusuke Konishi
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majordomo-***@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Andreas Rohner
2014-02-17 13:51:28 UTC
Permalink
Post by Ryusuke Konishi
Post by Andreas Rohner
Post by Ryusuke Konishi
Post by Andreas Rohner
Post by Ryusuke Konishi
Post by Andreas Rohner
Post by Ryusuke Konishi
Post by Andreas Rohner
+ if (end > nilfs->ns_nsegments)
+ end = nilfs->ns_nsegments;
Yes, this adjustment is what we should do here, but 'end' segnum was
if (end >= nilfs->ns_nsegments)
end = nilfs->ns_nsegments - 1;
Post by Andreas Rohner
+ if (end == segnum)
+ goto out;
and
if (end < segnum)
goto out;
Post by Andreas Rohner
+
+ down_read(&NILFS_MDT(sufile)->mi_sem);
+
+ while (segnum < end) {
and
while (segnum <= end) {
Post by Andreas Rohner
+ n = min_t(unsigned long,
+ segusages_per_block -
+ nilfs_sufile_get_offset(sufile, segnum),
+ end - segnum);
Then, we can reuse nilfs_sufile_segment_usages_in_block() to calculate
'n'.
Actually I don't think that is correct. What if range->start = 0 and
range->end = 8MB. Then segnum = 0 and end = 1. Your code would discard
segment 0 and segment 1, whereas my version would discard only segment
0, which seems to be more reasonable.
The problem seems that 'end' is not calculated properly.
I think it should be
end = nilfs_get_segnum_of_block(
nilfs,
(range->start + range->len + <segment-size-in-bytes> - 1)
Post by Andreas Rohner
Post by Ryusuke Konishi
nilfs->ns_blocksize_bits) - 1;
or can be simplified to
end = nilfs_get_segnum_of_block(
nilfs,
(range->start + range->len - 1) >> nilfs->ns_blocksize_bits);
if range->len > 0 is guaranteed.
The calculation of segnum extends the range to be discarded since it
is rounded down to segment alignment, but that of the current
implementation for 'end' truncates the range. Both should be
extended.
Then shouldn't both be truncated? The user expects that less bytes than
range->len are discarded but not more. Maybe I am wrong and it is
defined somewhere...
Uum, xfs looks to be truncating both. This seems to be authentic
when we think the original meaning of the fitrim range.
I first thought truncating the range can generate gaps between
consecutive calls of FITRIM (if we use FITRIM like that).
Hmm good point I didn't think of that. Then by extending both sides the
overlapping segments would get discarded twice with consecutive calls,
which shouldn't be a problem.
Post by Ryusuke Konishi
But now I'm
tempted to take the xfs approach. Let's take this semantics.
So, we now have to round up range->start to calculate (start) segnum.
Ok. I think in the end it doesn't really matter, because most users will
range->start = 0
range->minlen = 0
range->len = ULLONG_MAX
Or, we have the third option to avoid this issue. It's simply
truncating both to sector boundary. In this case, the gap problem
will not arise since userland programs usually separate the range
considering sector boundary or file system block size boundary. I
think this is more preferable than truncating it to segment size.
How do you think of this ?
Yes I think that is a good idea. It shouldn't be hard to implement
either. I will try it and then send you a new version of the patch.

Regards,
Andreas Rohner

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majordomo-***@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Andreas Rohner
2014-02-15 13:34:26 UTC
Permalink
This patch adds support for the FITRIM ioctl, which enables user space
tools to issue TRIM/DISCARD requests to the underlying device. Every
clean segment within the specified range will be discarded.

Signed-off-by: Andreas Rohner <andreas.rohner-***@public.gmane.org>
---
fs/nilfs2/ioctl.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 46 insertions(+)

diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 2b34021..cb1bea4 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -1072,6 +1072,49 @@ out:
}

/**
+ * nilfs_ioctl_trim_fs() - trim ioctl handle function
+ * @inode: inode object
+ * @argp: pointer on argument from userspace
+ *
+ * Decription: nilfs_ioctl_trim_fs is the FITRIM ioctl handle function. It
+ * checks the arguments from userspace and calls nilfs_sufile_trim_fs, which
+ * performs the actual trim operation.
+ *
+ * Return Value: On success, 0 is returned or negative error code, otherwise.
+ */
+static int nilfs_ioctl_trim_fs(struct inode *inode, void __user *argp)
+{
+ struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+ struct request_queue *q = bdev_get_queue(nilfs->ns_bdev);
+ struct fstrim_range range;
+ int ret = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!blk_queue_discard(q))
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&range, argp, sizeof(range)))
+ return -EFAULT;
+
+ range.minlen = max_t(unsigned int, (unsigned int)range.minlen,
+ q->limits.discard_granularity);
+
+ down_read(&nilfs->ns_segctor_sem);
+ ret = nilfs_sufile_trim_fs(nilfs->ns_sufile, &range);
+ up_read(&nilfs->ns_segctor_sem);
+
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user(argp, &range, sizeof(range)))
+ return -EFAULT;
+
+ return 0;
+}
+
+/**
* nilfs_ioctl_set_alloc_range - limit range of segments to be allocated
* @inode: inode object
* @argp: pointer on argument from userspace
@@ -1205,6 +1248,8 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return nilfs_ioctl_resize(inode, filp, argp);
case NILFS_IOCTL_SET_ALLOC_RANGE:
return nilfs_ioctl_set_alloc_range(inode, argp);
+ case FITRIM:
+ return nilfs_ioctl_trim_fs(inode, argp);
default:
return -ENOTTY;
}
@@ -1235,6 +1280,7 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case NILFS_IOCTL_SYNC:
case NILFS_IOCTL_RESIZE:
case NILFS_IOCTL_SET_ALLOC_RANGE:
+ case FITRIM:
break;
default:
return -ENOIOCTLCMD;
--
1.8.5.4

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majordomo-***@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Ryusuke Konishi
2014-02-17 03:01:41 UTC
Permalink
Post by Andreas Rohner
This patch adds support for the FITRIM ioctl, which enables user space
tools to issue TRIM/DISCARD requests to the underlying device. Every
clean segment within the specified range will be discarded.
---
fs/nilfs2/ioctl.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 46 insertions(+)
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 2b34021..cb1bea4 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
}
/**
+ * nilfs_ioctl_trim_fs() - trim ioctl handle function
+ *
+ * Decription: nilfs_ioctl_trim_fs is the FITRIM ioctl handle function. It
+ * checks the arguments from userspace and calls nilfs_sufile_trim_fs, which
+ * performs the actual trim operation.
+ *
+ * Return Value: On success, 0 is returned or negative error code, otherwise.
+ */
+static int nilfs_ioctl_trim_fs(struct inode *inode, void __user *argp)
+{
+ struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+ struct request_queue *q = bdev_get_queue(nilfs->ns_bdev);
+ struct fstrim_range range;
+ int ret = 0;
This initialization is unnecessary.

Other looks OK to me.

Regards,
Ryusuke Konishi
Post by Andreas Rohner
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!blk_queue_discard(q))
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&range, argp, sizeof(range)))
+ return -EFAULT;
+
+ range.minlen = max_t(unsigned int, (unsigned int)range.minlen,
+ q->limits.discard_granularity);
+
+ down_read(&nilfs->ns_segctor_sem);
+ ret = nilfs_sufile_trim_fs(nilfs->ns_sufile, &range);
+ up_read(&nilfs->ns_segctor_sem);
+
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user(argp, &range, sizeof(range)))
+ return -EFAULT;
+
+ return 0;
+}
+
+/**
* nilfs_ioctl_set_alloc_range - limit range of segments to be allocated
@@ -1205,6 +1248,8 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return nilfs_ioctl_resize(inode, filp, argp);
return nilfs_ioctl_set_alloc_range(inode, argp);
+ return nilfs_ioctl_trim_fs(inode, argp);
return -ENOTTY;
}
@@ -1235,6 +1280,7 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
break;
return -ENOIOCTLCMD;
--
1.8.5.4
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
More majordomo info at http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majordomo-***@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Andreas Rohner
2014-02-17 22:39:50 UTC
Permalink
Hi,

This patch adds support for the FITRIM ioctl, which allows user space
tools like fstrim to issue TRIM/DISCARD requests to the underlying
device. It takes a fstrim_range structure as a parameter and for every
clean segment in the specified range the function blkdev_issue_discard
is called. The range is truncated to sector boundaries.

Best regards,
Andreas Rohner

---
v1->v2 (based on review by Ryusuke Konishi)
* Remove upper limit of minlen
* Add check for minlen
* Round range to sector boundary instead of segment boundary
* Fix minor bug
* Use kmap_atomic instead of kmap
* Move input checks to ioctl.c
* Use nilfs_sufile_segment_usages_in_block()
--
Andreas Rohner (2):
nilfs2: add nilfs_sufile_trim_fs to trim clean segs
nilfs2: add FITRIM ioctl support for nilfs2

fs/nilfs2/ioctl.c | 52 +++++++++++++++++++
fs/nilfs2/sufile.c | 144 +++++++++++++++++++++++++++++++++++++++++++++++++++++
fs/nilfs2/sufile.h | 1 +
3 files changed, 197 insertions(+)
--
1.9.0

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majordomo-***@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Andreas Rohner
2014-02-17 22:39:51 UTC
Permalink
This patch adds the nilfs_sufile_trim_fs function, which takes a
fstrim_range structure and calls blkdev_issue_discard for every
clean segment in the specified range. The range is truncated to sector
boundaries.

Signed-off-by: Andreas Rohner <andreas.rohner-***@public.gmane.org>
---
fs/nilfs2/sufile.c | 144 +++++++++++++++++++++++++++++++++++++++++++++++++++++
fs/nilfs2/sufile.h | 1 +
2 files changed, 145 insertions(+)

diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 3127e9f..3605cc9 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -870,6 +870,150 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
}

/**
+ * nilfs_sufile_trim_fs() - trim ioctl handle function
+ * @sufile: inode of segment usage file
+ * @range: fstrim_range structure
+ *
+ * start: First Byte to trim
+ * len: number of Bytes to trim from start
+ * minlen: minimum extent length in Bytes
+ *
+ * Decription: nilfs_sufile_trim_fs goes through all segments containing bytes
+ * from start to start+len. start is rounded up to the next sector boundary
+ * and start+len is rounded down. For each clean segment blkdev_issue_discard
+ * function is invoked to trim it.
+ *
+ * Return Value: On success, 0 is returned or negative error code, otherwise.
+ */
+int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
+{
+ struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
+ struct buffer_head *su_bh;
+ struct nilfs_segment_usage *su;
+ void *kaddr;
+ size_t n, i, susz = NILFS_MDT(sufile)->mi_entry_size;
+ sector_t seg_start, seg_end, real_start, real_end,
+ start = 0, nblocks = 0;
+ u64 segnum, end, minlen, trimmed = 0;
+ int ret = 0;
+ unsigned int sect_size, sects_per_block;
+
+ sect_size = bdev_logical_block_size(nilfs->ns_bdev);
+ sects_per_block = (1 << nilfs->ns_blocksize_bits) / sect_size;
+ real_start = (range->start + sect_size - 1) / sect_size;
+ real_end = (range->start + range->len) / sect_size;
+ segnum = nilfs_get_segnum_of_block(nilfs, real_start / sects_per_block);
+ end = nilfs_get_segnum_of_block(nilfs, ((real_end + sects_per_block - 1)
+ / sects_per_block) + nilfs->ns_blocks_per_segment - 1);
+ minlen = range->minlen / sect_size;
+
+
+ if (end > nilfs->ns_nsegments)
+ end = nilfs->ns_nsegments;
+ if (segnum >= nilfs->ns_nsegments || end <= segnum)
+ goto out;
+
+ down_read(&NILFS_MDT(sufile)->mi_sem);
+
+ while (segnum < end) {
+ n = nilfs_sufile_segment_usages_in_block(sufile, segnum,
+ end - 1);
+
+ ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0,
+ &su_bh);
+ if (ret < 0) {
+ if (ret != -ENOENT)
+ goto out_sem;
+ /* hole */
+ segnum += n;
+ continue;
+ }
+
+ kaddr = kmap_atomic(su_bh->b_page);
+ su = nilfs_sufile_block_get_segment_usage(sufile, segnum,
+ su_bh, kaddr);
+ for (i = 0; i < n; ++i, ++segnum, su = (void *)su + susz) {
+ if (!nilfs_segment_usage_clean(su))
+ continue;
+
+ nilfs_get_segment_range(nilfs, segnum, &seg_start,
+ &seg_end);
+
+ if (!nblocks) {
+ /* start new extent */
+ start = seg_start;
+ nblocks = seg_end - seg_start + 1;
+ continue;
+ }
+
+ if (start + nblocks == seg_start) {
+ /* add to previous extent */
+ nblocks += seg_end - seg_start + 1;
+ continue;
+ }
+
+ /* discard previous extent */
+ start *= sects_per_block;
+ nblocks *= sects_per_block;
+ if (start < real_start) {
+ nblocks -= real_start - start;
+ start = real_start;
+ }
+ if (start + nblocks > real_end)
+ nblocks = real_end - start;
+
+ if (nblocks >= minlen) {
+ kunmap_atomic(kaddr);
+
+ ret = blkdev_issue_discard(nilfs->ns_bdev,
+ start, nblocks, GFP_NOFS, 0);
+ if (ret < 0) {
+ put_bh(su_bh);
+ goto out_sem;
+ }
+
+ trimmed += nblocks;
+ kaddr = kmap_atomic(su_bh->b_page);
+ su = nilfs_sufile_block_get_segment_usage(
+ sufile, segnum, su_bh, kaddr);
+ }
+
+ /* start new extent */
+ start = seg_start;
+ nblocks = seg_end - seg_start + 1;
+ }
+ kunmap_atomic(kaddr);
+ put_bh(su_bh);
+ }
+
+
+ if (nblocks) {
+ /* discard last extent */
+ start *= sects_per_block;
+ nblocks *= sects_per_block;
+ if (start < real_start) {
+ nblocks -= real_start - start;
+ start = real_start;
+ }
+ if (start + nblocks > real_end)
+ nblocks = real_end - start;
+
+ if (nblocks >= minlen) {
+ ret = blkdev_issue_discard(nilfs->ns_bdev, start,
+ nblocks, GFP_NOFS, 0);
+ if (!ret)
+ trimmed += nblocks;
+ }
+ }
+
+out_sem:
+ up_read(&NILFS_MDT(sufile)->mi_sem);
+out:
+ range->len = trimmed * sect_size;
+ return ret;
+}
+
+/**
* nilfs_sufile_read - read or get sufile inode
* @sb: super block instance
* @susize: size of a segment usage entry
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index e84bc5b..2434abd 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -65,6 +65,7 @@ void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs);
int nilfs_sufile_read(struct super_block *sb, size_t susize,
struct nilfs_inode *raw_inode, struct inode **inodep);
+int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range);

/**
* nilfs_sufile_scrap - make a segment garbage
--
1.9.0

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majordomo-***@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Ryusuke Konishi
2014-02-18 18:37:33 UTC
Permalink
Post by Andreas Rohner
This patch adds the nilfs_sufile_trim_fs function, which takes a
fstrim_range structure and calls blkdev_issue_discard for every
clean segment in the specified range. The range is truncated to sector
boundaries.
---
fs/nilfs2/sufile.c | 144 +++++++++++++++++++++++++++++++++++++++++++++++++++++
fs/nilfs2/sufile.h | 1 +
2 files changed, 145 insertions(+)
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 3127e9f..3605cc9 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -870,6 +870,150 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
}
/**
+ * nilfs_sufile_trim_fs() - trim ioctl handle function
+ *
+ * start: First Byte to trim
+ * len: number of Bytes to trim from start
+ * minlen: minimum extent length in Bytes
+ *
+ * Decription: nilfs_sufile_trim_fs goes through all segments containing bytes
+ * from start to start+len. start is rounded up to the next sector boundary
+ * and start+len is rounded down. For each clean segment blkdev_issue_discard
+ * function is invoked to trim it.
+ *
+ * Return Value: On success, 0 is returned or negative error code, otherwise.
+ */
+int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
+{
+ struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
+ struct buffer_head *su_bh;
+ struct nilfs_segment_usage *su;
+ void *kaddr;
+ size_t n, i, susz = NILFS_MDT(sufile)->mi_entry_size;
+ sector_t seg_start, seg_end, real_start, real_end,
+ start = 0, nblocks = 0;
+ u64 segnum, end, minlen, trimmed = 0;
+ int ret = 0;
+ unsigned int sect_size, sects_per_block;
+
+ sect_size = bdev_logical_block_size(nilfs->ns_bdev);
+ sects_per_block = (1 << nilfs->ns_blocksize_bits) / sect_size;
+ real_start = (range->start + sect_size - 1) / sect_size;
+ real_end = (range->start + range->len) / sect_size;
Why not use start_sect, end_sect instead of real_start, real_end?
real_{start,end} are not intuitive to me.

We need to use do_div() for these divisions, and DIV_ROUND_UP_ULL()
macro should be applied to round up the start sector.

Moreover, we have to avoid overflow in "range->start + range->len".
Actually, range->len is usually set to UULONG_MAX.

So, these will be as follows:

u64 len = range->len;

...

do_div(len, sect_size);
if (!len)
goto out;

...
start_sect = DIV_ROUND_UP_ULL(range->start, sect_size);
end_sect = start_sect + len - 1; /* this end_sect is inclusive */

Note that, we also should care about large range->start to avoid
overflow in substitution to start_sect (sector_t) since sector_t may
be 32-bit. We should check it before the division.

Here, I recant my earlier comment. We should do the following check
in this function to clarify that the overflow issue is handled
properly.

u64 max_byte =
((u64)nilfs->ns_nsegments * nilfs->ns_blocks_per_segments)
<< nilfs->ns_blocksize_bits;

...
if (range.len < nilfs->ns_blocksize || range.start >= max_byte)
return -EINVAL;
...
(divisions)
Post by Andreas Rohner
+ segnum = nilfs_get_segnum_of_block(nilfs, real_start / sects_per_block);
+ end = nilfs_get_segnum_of_block(nilfs, ((real_end + sects_per_block - 1)
+ / sects_per_block) + nilfs->ns_blocks_per_segment - 1);
It would be better to use the following intermediate variables to
improve readability of these calculations.

And, these calculations need sector_div() and DIV_ROUND_UP_SECTOR_T()
macro:

start_block = start_sect;
sector_div(start_block, sects_per_block);

end_block = DIV_ROUND_UP_SECTOR_T(end_sect, sects_per_block);

segnum = nilfs_get_segnum_of_block(nilfs, start_block);
end = nilfs_get_segnum_of_block(
nilfs, end_block + nilfs->ns_blocks_per_segment - 1);
Post by Andreas Rohner
+ minlen = range->minlen / sect_size;
And, this one needs do_div():

minlen = range->minlen;
do_div(minlen, sect_size);
Post by Andreas Rohner
+
+
+ if (end > nilfs->ns_nsegments)
+ end = nilfs->ns_nsegments;
+ if (segnum >= nilfs->ns_nsegments || end <= segnum)
+ goto out;
+
+ down_read(&NILFS_MDT(sufile)->mi_sem);
+
+ while (segnum < end) {
+ n = nilfs_sufile_segment_usages_in_block(sufile, segnum,
+ end - 1);
+
+ ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0,
+ &su_bh);
+ if (ret < 0) {
+ if (ret != -ENOENT)
+ goto out_sem;
+ /* hole */
+ segnum += n;
+ continue;
+ }
+
+ kaddr = kmap_atomic(su_bh->b_page);
+ su = nilfs_sufile_block_get_segment_usage(sufile, segnum,
+ su_bh, kaddr);
+ for (i = 0; i < n; ++i, ++segnum, su = (void *)su + susz) {
+ if (!nilfs_segment_usage_clean(su))
+ continue;
+
+ nilfs_get_segment_range(nilfs, segnum, &seg_start,
+ &seg_end);
+
+ if (!nblocks) {
+ /* start new extent */
+ start = seg_start;
+ nblocks = seg_end - seg_start + 1;
+ continue;
+ }
+
+ if (start + nblocks == seg_start) {
+ /* add to previous extent */
+ nblocks += seg_end - seg_start + 1;
+ continue;
+ }
+
+ /* discard previous extent */
+ start *= sects_per_block;
+ nblocks *= sects_per_block;
+ if (start < real_start) {
+ nblocks -= real_start - start;
+ start = real_start;
+ }
+ if (start + nblocks > real_end)
+ nblocks = real_end - start;
Why do you need this adjustment during discarding "previous" extent ?
Post by Andreas Rohner
+ if (nblocks >= minlen) {
+ kunmap_atomic(kaddr);
+
+ ret = blkdev_issue_discard(nilfs->ns_bdev,
+ start, nblocks, GFP_NOFS, 0);
+ if (ret < 0) {
+ put_bh(su_bh);
+ goto out_sem;
+ }
+
+ trimmed += nblocks;
+ kaddr = kmap_atomic(su_bh->b_page);
+ su = nilfs_sufile_block_get_segment_usage(
+ sufile, segnum, su_bh, kaddr);
+ }
+
+ /* start new extent */
+ start = seg_start;
+ nblocks = seg_end - seg_start + 1;
+ }
+ kunmap_atomic(kaddr);
+ put_bh(su_bh);
+ }
+
+
+ if (nblocks) {
+ /* discard last extent */
+ start *= sects_per_block;
+ nblocks *= sects_per_block;
+ if (start < real_start) {
+ nblocks -= real_start - start;
+ start = real_start;
+ }
+ if (start + nblocks > real_end)
+ nblocks = real_end - start;
+
+ if (nblocks >= minlen) {
+ ret = blkdev_issue_discard(nilfs->ns_bdev, start,
+ nblocks, GFP_NOFS, 0);
+ if (!ret)
+ trimmed += nblocks;
+ }
+ }
+
+ up_read(&NILFS_MDT(sufile)->mi_sem);
+ range->len = trimmed * sect_size;
+ return ret;
+}
+
+/**
* nilfs_sufile_read - read or get sufile inode
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index e84bc5b..2434abd 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -65,6 +65,7 @@ void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs);
int nilfs_sufile_read(struct super_block *sb, size_t susize,
struct nilfs_inode *raw_inode, struct inode **inodep);
+int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range);
/**
* nilfs_sufile_scrap - make a segment garbage
--
1.9.0
Please try to compile this patch both for 32-bit kernel and 64-bit
kernel to test if the patch is architecture independent.


Regards,
Ryusuke Konishi
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majordomo-***@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Andreas Rohner
2014-02-19 06:36:58 UTC
Permalink
Post by Ryusuke Konishi
Post by Andreas Rohner
This patch adds the nilfs_sufile_trim_fs function, which takes a
fstrim_range structure and calls blkdev_issue_discard for every
clean segment in the specified range. The range is truncated to sector
boundaries.
---
fs/nilfs2/sufile.c | 144 +++++++++++++++++++++++++++++++++++++++++++++++++++++
fs/nilfs2/sufile.h | 1 +
2 files changed, 145 insertions(+)
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 3127e9f..3605cc9 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -870,6 +870,150 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
}
/**
+ * nilfs_sufile_trim_fs() - trim ioctl handle function
+ *
+ * start: First Byte to trim
+ * len: number of Bytes to trim from start
+ * minlen: minimum extent length in Bytes
+ *
+ * Decription: nilfs_sufile_trim_fs goes through all segments containing bytes
+ * from start to start+len. start is rounded up to the next sector boundary
+ * and start+len is rounded down. For each clean segment blkdev_issue_discard
+ * function is invoked to trim it.
+ *
+ * Return Value: On success, 0 is returned or negative error code, otherwise.
+ */
+int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
+{
+ struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
+ struct buffer_head *su_bh;
+ struct nilfs_segment_usage *su;
+ void *kaddr;
+ size_t n, i, susz = NILFS_MDT(sufile)->mi_entry_size;
+ sector_t seg_start, seg_end, real_start, real_end,
+ start = 0, nblocks = 0;
+ u64 segnum, end, minlen, trimmed = 0;
+ int ret = 0;
+ unsigned int sect_size, sects_per_block;
+
+ sect_size = bdev_logical_block_size(nilfs->ns_bdev);
+ sects_per_block = (1 << nilfs->ns_blocksize_bits) / sect_size;
+ real_start = (range->start + sect_size - 1) / sect_size;
+ real_end = (range->start + range->len) / sect_size;
Why not use start_sect, end_sect instead of real_start, real_end?
real_{start,end} are not intuitive to me.
Yes that looks better.
Post by Ryusuke Konishi
We need to use do_div() for these divisions, and DIV_ROUND_UP_ULL()
macro should be applied to round up the start sector.
Moreover, we have to avoid overflow in "range->start + range->len".
Actually, range->len is usually set to UULONG_MAX.
Ah yes I forgot to test that case.
Post by Ryusuke Konishi
u64 len = range->len;
...
do_div(len, sect_size);
if (!len)
goto out;
...
start_sect = DIV_ROUND_UP_ULL(range->start, sect_size);
end_sect = start_sect + len - 1; /* this end_sect is inclusive */
I don't get why this has to be inclusive. To me this seems to be a
matter of taste. I think it is much easier to reason about this stuff
and more "natural", if start_sect is inclusive and end_sect is
exclusive. Then segnum is inclusive and end is exclusive. It is just
like with pointer arithmetic.
Post by Ryusuke Konishi
Note that, we also should care about large range->start to avoid
overflow in substitution to start_sect (sector_t) since sector_t may
be 32-bit. We should check it before the division.
Here, I recant my earlier comment. We should do the following check
in this function to clarify that the overflow issue is handled
properly.
Ok.
Post by Ryusuke Konishi
u64 max_byte =
((u64)nilfs->ns_nsegments * nilfs->ns_blocks_per_segments)
<< nilfs->ns_blocksize_bits;
...
if (range.len < nilfs->ns_blocksize || range.start >= max_byte)
return -EINVAL;
...
(divisions)
Post by Andreas Rohner
+ segnum = nilfs_get_segnum_of_block(nilfs, real_start / sects_per_block);
+ end = nilfs_get_segnum_of_block(nilfs, ((real_end + sects_per_block - 1)
+ / sects_per_block) + nilfs->ns_blocks_per_segment - 1);
It would be better to use the following intermediate variables to
improve readability of these calculations.
Ok.
Post by Ryusuke Konishi
And, these calculations need sector_div() and DIV_ROUND_UP_SECTOR_T()
start_block = start_sect;
sector_div(start_block, sects_per_block);
end_block = DIV_ROUND_UP_SECTOR_T(end_sect, sects_per_block);
segnum = nilfs_get_segnum_of_block(nilfs, start_block);
end = nilfs_get_segnum_of_block(
nilfs, end_block + nilfs->ns_blocks_per_segment - 1);
Post by Andreas Rohner
+ minlen = range->minlen / sect_size;
minlen = range->minlen;
do_div(minlen, sect_size);
Post by Andreas Rohner
+
+
+ if (end > nilfs->ns_nsegments)
+ end = nilfs->ns_nsegments;
+ if (segnum >= nilfs->ns_nsegments || end <= segnum)
+ goto out;
+
+ down_read(&NILFS_MDT(sufile)->mi_sem);
+
+ while (segnum < end) {
+ n = nilfs_sufile_segment_usages_in_block(sufile, segnum,
+ end - 1);
+
+ ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0,
+ &su_bh);
+ if (ret < 0) {
+ if (ret != -ENOENT)
+ goto out_sem;
+ /* hole */
+ segnum += n;
+ continue;
+ }
+
+ kaddr = kmap_atomic(su_bh->b_page);
+ su = nilfs_sufile_block_get_segment_usage(sufile, segnum,
+ su_bh, kaddr);
+ for (i = 0; i < n; ++i, ++segnum, su = (void *)su + susz) {
+ if (!nilfs_segment_usage_clean(su))
+ continue;
+
+ nilfs_get_segment_range(nilfs, segnum, &seg_start,
+ &seg_end);
+
+ if (!nblocks) {
+ /* start new extent */
+ start = seg_start;
+ nblocks = seg_end - seg_start + 1;
+ continue;
+ }
+
+ if (start + nblocks == seg_start) {
+ /* add to previous extent */
+ nblocks += seg_end - seg_start + 1;
+ continue;
+ }
+
+ /* discard previous extent */
+ start *= sects_per_block;
+ nblocks *= sects_per_block;
+ if (start < real_start) {
+ nblocks -= real_start - start;
+ start = real_start;
+ }
+ if (start + nblocks > real_end)
+ nblocks = real_end - start;
Why do you need this adjustment during discarding "previous" extent ?
You are right I don't need it.
Post by Ryusuke Konishi
Post by Andreas Rohner
+ if (nblocks >= minlen) {
+ kunmap_atomic(kaddr);
+
+ ret = blkdev_issue_discard(nilfs->ns_bdev,
+ start, nblocks, GFP_NOFS, 0);
+ if (ret < 0) {
+ put_bh(su_bh);
+ goto out_sem;
+ }
+
+ trimmed += nblocks;
+ kaddr = kmap_atomic(su_bh->b_page);
+ su = nilfs_sufile_block_get_segment_usage(
+ sufile, segnum, su_bh, kaddr);
+ }
+
+ /* start new extent */
+ start = seg_start;
+ nblocks = seg_end - seg_start + 1;
+ }
+ kunmap_atomic(kaddr);
+ put_bh(su_bh);
+ }
+
+
+ if (nblocks) {
+ /* discard last extent */
+ start *= sects_per_block;
+ nblocks *= sects_per_block;
+ if (start < real_start) {
+ nblocks -= real_start - start;
+ start = real_start;
+ }
+ if (start + nblocks > real_end)
+ nblocks = real_end - start;
+
+ if (nblocks >= minlen) {
+ ret = blkdev_issue_discard(nilfs->ns_bdev, start,
+ nblocks, GFP_NOFS, 0);
+ if (!ret)
+ trimmed += nblocks;
+ }
+ }
+
+ up_read(&NILFS_MDT(sufile)->mi_sem);
+ range->len = trimmed * sect_size;
+ return ret;
+}
+
+/**
* nilfs_sufile_read - read or get sufile inode
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index e84bc5b..2434abd 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -65,6 +65,7 @@ void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs);
int nilfs_sufile_read(struct super_block *sb, size_t susize,
struct nilfs_inode *raw_inode, struct inode **inodep);
+int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range);
/**
* nilfs_sufile_scrap - make a segment garbage
--
1.9.0
Please try to compile this patch both for 32-bit kernel and 64-bit
kernel to test if the patch is architecture independent.
Ok.

With all the proper division macros it gets very complicated. I think it
would simplify things if we just truncate to block size instead of
sector size. Then we could use simple bit shifts. It would look
something like this:

if (range->len < nilfs->ns_blocksize ||
range->start >= max_byte)
return -EINVAL;
/* sector_t could be 32 bit */
if (range->len > max_byte)
range->len = max_byte;

sects_per_block = (1 << nilfs->ns_blocksize_bits) /
bdev_logical_block_size(nilfs->ns_bdev);

start_block = (range->start + nilfs->ns_blocksize - 1) >>
nilfs->ns_blocksize_bits;
end_block = start_block + (range->len >>
nilfs->ns_blocksize_bits);

segnum = nilfs_get_segnum_of_block(nilfs, start_block);
end = nilfs_get_segnum_of_block(nilfs, end_block +
nilfs->ns_blocks_per_segment - 1);

minlen = range->minlen >> nilfs->ns_blocksize_bits;


What do you think?

Regards,
Andreas Rohner
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majordomo-***@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Ryusuke Konishi
2014-02-20 08:12:36 UTC
Permalink
Post by Andreas Rohner
Post by Ryusuke Konishi
Please try to compile this patch both for 32-bit kernel and 64-bit
kernel to test if the patch is architecture independent.
Ok.
With all the proper division macros it gets very complicated. I think it
would simplify things if we just truncate to block size instead of
sector size. Then we could use simple bit shifts. It would look
if (range->len < nilfs->ns_blocksize ||
range->start >= max_byte)
return -EINVAL;
/* sector_t could be 32 bit */
if (range->len > max_byte)
range->len = max_byte;
sects_per_block = (1 << nilfs->ns_blocksize_bits) /
bdev_logical_block_size(nilfs->ns_bdev);
start_block = (range->start + nilfs->ns_blocksize - 1) >>
nilfs->ns_blocksize_bits;
end_block = start_block + (range->len >>
nilfs->ns_blocksize_bits);
segnum = nilfs_get_segnum_of_block(nilfs, start_block);
end = nilfs_get_segnum_of_block(nilfs, end_block +
nilfs->ns_blocks_per_segment - 1);
minlen = range->minlen >> nilfs->ns_blocksize_bits;
What do you think?
Yes, sector-based calculations looks a bit complicated. Your idea,
adjusting positions to fs-block alignment, is reasonable. I agree
with the approach.
Post by Andreas Rohner
Post by Ryusuke Konishi
u64 len = range->len;
...
do_div(len, sect_size);
if (!len)
goto out;
...
start_sect = DIV_ROUND_UP_ULL(range->start, sect_size);
end_sect = start_sect + len - 1; /* this end_sect is inclusive */
I don't get why this has to be inclusive. To me this seems to be a
matter of taste. I think it is much easier to reason about this stuff
and more "natural", if start_sect is inclusive and end_sect is
exclusive. Then segnum is inclusive and end is exclusive. It is just
like with pointer arithmetic.
Yes, it is implementation matter (taste). The above comment just
notice that the result of the calculation is inclusive which differs
from the original meaning. What I hope there, is that usage of
variables (e.g. the edge is inclusive or exclusive) is taken care to
avoid confusion.

Regards,
Ryusuke Konishi
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majordomo-***@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Andreas Rohner
2014-02-17 22:39:52 UTC
Permalink
This patch adds support for the FITRIM ioctl, which enables user space
tools to issue TRIM/DISCARD requests to the underlying device. Every
clean segment within the specified range will be discarded.

Signed-off-by: Andreas Rohner <andreas.rohner-***@public.gmane.org>
---
fs/nilfs2/ioctl.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 52 insertions(+)

diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 2b34021..cf21dbd 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -1072,6 +1072,55 @@ out:
}

/**
+ * nilfs_ioctl_trim_fs() - trim ioctl handle function
+ * @inode: inode object
+ * @argp: pointer on argument from userspace
+ *
+ * Decription: nilfs_ioctl_trim_fs is the FITRIM ioctl handle function. It
+ * checks the arguments from userspace and calls nilfs_sufile_trim_fs, which
+ * performs the actual trim operation.
+ *
+ * Return Value: On success, 0 is returned or negative error code, otherwise.
+ */
+static int nilfs_ioctl_trim_fs(struct inode *inode, void __user *argp)
+{
+ struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+ struct request_queue *q = bdev_get_queue(nilfs->ns_bdev);
+ struct fstrim_range range;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!blk_queue_discard(q))
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&range, argp, sizeof(range)))
+ return -EFAULT;
+
+ range.minlen = max_t(unsigned int, (unsigned int)range.minlen,
+ q->limits.discard_granularity);
+
+ if (range.len < nilfs->ns_blocksize ||
+ range.start >= (nilfs->ns_nsegments *
+ nilfs->ns_blocks_per_segment) <<
+ nilfs->ns_blocksize_bits)
+ return -EINVAL;
+
+ down_read(&nilfs->ns_segctor_sem);
+ ret = nilfs_sufile_trim_fs(nilfs->ns_sufile, &range);
+ up_read(&nilfs->ns_segctor_sem);
+
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user(argp, &range, sizeof(range)))
+ return -EFAULT;
+
+ return 0;
+}
+
+/**
* nilfs_ioctl_set_alloc_range - limit range of segments to be allocated
* @inode: inode object
* @argp: pointer on argument from userspace
@@ -1205,6 +1254,8 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return nilfs_ioctl_resize(inode, filp, argp);
case NILFS_IOCTL_SET_ALLOC_RANGE:
return nilfs_ioctl_set_alloc_range(inode, argp);
+ case FITRIM:
+ return nilfs_ioctl_trim_fs(inode, argp);
default:
return -ENOTTY;
}
@@ -1235,6 +1286,7 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case NILFS_IOCTL_SYNC:
case NILFS_IOCTL_RESIZE:
case NILFS_IOCTL_SET_ALLOC_RANGE:
+ case FITRIM:
break;
default:
return -ENOIOCTLCMD;
--
1.9.0

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majordomo-***@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Loading...