[PATCH] xfs: Don't flush inodes when project quota exceeded

classic Classic list List threaded Threaded
14 messages Options
Reply | Threaded
Open this post in threaded view
|

[PATCH] xfs: Don't flush inodes when project quota exceeded

Jan Kara
When project quota gets exceeded xfs_iomap_write_delay() ends up flushing
inodes because ENOSPC gets returned from xfs_bmapi_delay() instead of EDQUOT.
This makes handling of writes over project quota rather slow as a simple test
program shows:
        fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC, 0644);
        for (i = 0; i < 50000; i++)
                pwrite(fd, buf, 4096, i*4096);

Writing 200 MB like this into a directory with 100 MB project quota takes
around 6 minutes while it takes about 2 seconds with this patch applied. This
actually happens in a real world load when nfs pushes data into a directory
which is over project quota.

Fix the problem by replacing XFS_QMOPT_ENOSPC flag with XFS_QMOPT_EPDQUOT.
That makes xfs_trans_reserve_quota_bydquots() return new error EPDQUOT when
project quota is exceeded. xfs_bmapi_delay() then uses this flag so that
xfs_iomap_write_delay() can distinguish real ENOSPC (requiring flushing)
from exceeded project quota (not requiring flushing).

As a side effect this patch fixes inconsistency where e.g. xfs_create()
returned EDQUOT even when project quota was exceeded.

Signed-off-by: Jan Kara <[hidden email]>
---
 fs/xfs/xfs_bmap.c        |    3 ++-
 fs/xfs/xfs_iomap.c       |    8 ++++++--
 fs/xfs/xfs_linux.h       |    1 +
 fs/xfs/xfs_qm.c          |   13 +++++--------
 fs/xfs/xfs_quota.h       |    2 +-
 fs/xfs/xfs_trans_dquot.c |   18 +++++++++---------
 6 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 848ffa7..027398f 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -4471,7 +4471,8 @@ xfs_bmapi_reserve_delalloc(
  * allocated blocks already inside this loop.
  */
  error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0,
- rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
+ XFS_QMOPT_EPDQUOT |
+ (rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS));
  if (error)
  return error;
 
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 973dff6..86e8016 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -428,6 +428,7 @@ retry:
  case 0:
  case ENOSPC:
  case EDQUOT:
+ case EPDQUOT:
  break;
  default:
  return XFS_ERROR(error);
@@ -441,8 +442,11 @@ retry:
  */
  if (nimaps == 0) {
  trace_xfs_delalloc_enospc(ip, offset, count);
- if (flushed)
- return XFS_ERROR(error ? error : ENOSPC);
+ if (flushed) {
+ if (error == 0 || error == EPDQUOT)
+ error = ENOSPC;
+ return XFS_ERROR(error);
+ }
 
  if (error == ENOSPC) {
  xfs_iunlock(ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 828662f..31368df 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -145,6 +145,7 @@
 #define ENOATTR ENODATA /* Attribute not found */
 #define EWRONGFS EINVAL /* Mount with wrong filesystem type */
 #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
+#define EPDQUOT EXFULL /* Project quota exceeded */
 
 #define SYNCHRONIZE() barrier()
 #define __return_address __builtin_return_address(0)
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 2e86fa0..99ace03 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1790,7 +1790,7 @@ xfs_qm_vop_chown_reserve(
  uint flags)
 {
  xfs_mount_t *mp = ip->i_mount;
- uint delblks, blkflags, prjflags = 0;
+ uint delblks, blkflags;
  xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq;
  int error;
 
@@ -1817,11 +1817,8 @@ xfs_qm_vop_chown_reserve(
  }
  }
  if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) {
- if (XFS_IS_PQUOTA_ON(ip->i_mount) &&
-     xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id))
- prjflags = XFS_QMOPT_ENOSPC;
-
- if (prjflags ||
+ if ((XFS_IS_PQUOTA_ON(ip->i_mount) &&
+     xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id)) ||
     (XFS_IS_GQUOTA_ON(ip->i_mount) &&
      ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id))) {
  delblksgdq = gdqp;
@@ -1834,7 +1831,7 @@ xfs_qm_vop_chown_reserve(
 
  if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
  delblksudq, delblksgdq, ip->i_d.di_nblocks, 1,
- flags | blkflags | prjflags)))
+ flags | blkflags)))
  return (error);
 
  /*
@@ -1851,7 +1848,7 @@ xfs_qm_vop_chown_reserve(
  ASSERT(unresudq || unresgdq);
  if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
  delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0,
- flags | blkflags | prjflags)))
+ flags | blkflags)))
  return (error);
  xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
  unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0,
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index b50ec5b..264b455 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -203,7 +203,7 @@ typedef struct xfs_qoff_logformat {
 #define XFS_QMOPT_DOWARN        0x0000400 /* increase warning cnt if needed */
 #define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */
 #define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
-#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */
+#define XFS_QMOPT_EPDQUOT 0x0004000 /* return EPDQUOT when project quota exceeded */
 
 /*
  * flags to xfs_trans_mod_dquot to indicate which field needs to be
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 0c7fa54..27acce3 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -728,8 +728,6 @@ xfs_trans_dqresv(
 
 error_return:
  xfs_dqunlock(dqp);
- if (flags & XFS_QMOPT_ENOSPC)
- return ENOSPC;
  return EDQUOT;
 }
 
@@ -741,7 +739,6 @@ error_return:
  * approach.
  *
  * flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
- *   XFS_QMOPT_ENOSPC returns ENOSPC not EDQUOT.  Used by pquota.
  *   XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks
  *   XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks
  * dquots are unlocked on return, if they were not locked by caller.
@@ -767,8 +764,7 @@ xfs_trans_reserve_quota_bydquots(
  ASSERT(flags & XFS_QMOPT_RESBLK_MASK);
 
  if (udqp) {
- error = xfs_trans_dqresv(tp, mp, udqp, nblks, ninos,
- (flags & ~XFS_QMOPT_ENOSPC));
+ error = xfs_trans_dqresv(tp, mp, udqp, nblks, ninos, flags);
  if (error)
  return error;
  resvd = 1;
@@ -785,6 +781,12 @@ xfs_trans_reserve_quota_bydquots(
  xfs_trans_dqresv(tp, mp, udqp,
  -nblks, -ninos, flags);
  }
+ if (error == EDQUOT && XFS_IS_PQUOTA_ON(mp)) {
+ if (flags & XFS_QMOPT_EPDQUOT)
+ error = EPDQUOT;
+ else
+ error = ENOSPC;
+ }
  return error;
  }
  }
@@ -813,16 +815,14 @@ xfs_trans_reserve_quota_nblks(
 
  if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
  return 0;
- if (XFS_IS_PQUOTA_ON(mp))
- flags |= XFS_QMOPT_ENOSPC;
 
  ASSERT(ip->i_ino != mp->m_sb.sb_uquotino);
  ASSERT(ip->i_ino != mp->m_sb.sb_gquotino);
 
  ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
+ ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_EPDQUOT)) ==
  XFS_TRANS_DQ_RES_RTBLKS ||
-       (flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
+       (flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_EPDQUOT)) ==
  XFS_TRANS_DQ_RES_BLKS);
 
  /*
--
1.7.1

_______________________________________________
xfs mailing list
[hidden email]
http://oss.sgi.com/mailman/listinfo/xfs
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH] xfs: Don't flush inodes when project quota exceeded

Jan Kara
On Tue 13-11-12 01:36:13, Jan Kara wrote:

> When project quota gets exceeded xfs_iomap_write_delay() ends up flushing
> inodes because ENOSPC gets returned from xfs_bmapi_delay() instead of EDQUOT.
> This makes handling of writes over project quota rather slow as a simple test
> program shows:
> fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC, 0644);
> for (i = 0; i < 50000; i++)
> pwrite(fd, buf, 4096, i*4096);
>
> Writing 200 MB like this into a directory with 100 MB project quota takes
> around 6 minutes while it takes about 2 seconds with this patch applied. This
> actually happens in a real world load when nfs pushes data into a directory
> which is over project quota.
>
> Fix the problem by replacing XFS_QMOPT_ENOSPC flag with XFS_QMOPT_EPDQUOT.
> That makes xfs_trans_reserve_quota_bydquots() return new error EPDQUOT when
> project quota is exceeded. xfs_bmapi_delay() then uses this flag so that
> xfs_iomap_write_delay() can distinguish real ENOSPC (requiring flushing)
> from exceeded project quota (not requiring flushing).
>
> As a side effect this patch fixes inconsistency where e.g. xfs_create()
> returned EDQUOT even when project quota was exceeded.
  Ping? Any opinions?

                                                                Honza

>
> Signed-off-by: Jan Kara <[hidden email]>
> ---
>  fs/xfs/xfs_bmap.c        |    3 ++-
>  fs/xfs/xfs_iomap.c       |    8 ++++++--
>  fs/xfs/xfs_linux.h       |    1 +
>  fs/xfs/xfs_qm.c          |   13 +++++--------
>  fs/xfs/xfs_quota.h       |    2 +-
>  fs/xfs/xfs_trans_dquot.c |   18 +++++++++---------
>  6 files changed, 24 insertions(+), 21 deletions(-)
>
> diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
> index 848ffa7..027398f 100644
> --- a/fs/xfs/xfs_bmap.c
> +++ b/fs/xfs/xfs_bmap.c
> @@ -4471,7 +4471,8 @@ xfs_bmapi_reserve_delalloc(
>   * allocated blocks already inside this loop.
>   */
>   error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0,
> - rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
> + XFS_QMOPT_EPDQUOT |
> + (rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS));
>   if (error)
>   return error;
>  
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 973dff6..86e8016 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -428,6 +428,7 @@ retry:
>   case 0:
>   case ENOSPC:
>   case EDQUOT:
> + case EPDQUOT:
>   break;
>   default:
>   return XFS_ERROR(error);
> @@ -441,8 +442,11 @@ retry:
>   */
>   if (nimaps == 0) {
>   trace_xfs_delalloc_enospc(ip, offset, count);
> - if (flushed)
> - return XFS_ERROR(error ? error : ENOSPC);
> + if (flushed) {
> + if (error == 0 || error == EPDQUOT)
> + error = ENOSPC;
> + return XFS_ERROR(error);
> + }
>  
>   if (error == ENOSPC) {
>   xfs_iunlock(ip, XFS_ILOCK_EXCL);
> diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
> index 828662f..31368df 100644
> --- a/fs/xfs/xfs_linux.h
> +++ b/fs/xfs/xfs_linux.h
> @@ -145,6 +145,7 @@
>  #define ENOATTR ENODATA /* Attribute not found */
>  #define EWRONGFS EINVAL /* Mount with wrong filesystem type */
>  #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
> +#define EPDQUOT EXFULL /* Project quota exceeded */
>  
>  #define SYNCHRONIZE() barrier()
>  #define __return_address __builtin_return_address(0)
> diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
> index 2e86fa0..99ace03 100644
> --- a/fs/xfs/xfs_qm.c
> +++ b/fs/xfs/xfs_qm.c
> @@ -1790,7 +1790,7 @@ xfs_qm_vop_chown_reserve(
>   uint flags)
>  {
>   xfs_mount_t *mp = ip->i_mount;
> - uint delblks, blkflags, prjflags = 0;
> + uint delblks, blkflags;
>   xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq;
>   int error;
>  
> @@ -1817,11 +1817,8 @@ xfs_qm_vop_chown_reserve(
>   }
>   }
>   if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) {
> - if (XFS_IS_PQUOTA_ON(ip->i_mount) &&
> -     xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id))
> - prjflags = XFS_QMOPT_ENOSPC;
> -
> - if (prjflags ||
> + if ((XFS_IS_PQUOTA_ON(ip->i_mount) &&
> +     xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id)) ||
>      (XFS_IS_GQUOTA_ON(ip->i_mount) &&
>       ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id))) {
>   delblksgdq = gdqp;
> @@ -1834,7 +1831,7 @@ xfs_qm_vop_chown_reserve(
>  
>   if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
>   delblksudq, delblksgdq, ip->i_d.di_nblocks, 1,
> - flags | blkflags | prjflags)))
> + flags | blkflags)))
>   return (error);
>  
>   /*
> @@ -1851,7 +1848,7 @@ xfs_qm_vop_chown_reserve(
>   ASSERT(unresudq || unresgdq);
>   if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
>   delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0,
> - flags | blkflags | prjflags)))
> + flags | blkflags)))
>   return (error);
>   xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
>   unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0,
> diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
> index b50ec5b..264b455 100644
> --- a/fs/xfs/xfs_quota.h
> +++ b/fs/xfs/xfs_quota.h
> @@ -203,7 +203,7 @@ typedef struct xfs_qoff_logformat {
>  #define XFS_QMOPT_DOWARN        0x0000400 /* increase warning cnt if needed */
>  #define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */
>  #define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
> -#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */
> +#define XFS_QMOPT_EPDQUOT 0x0004000 /* return EPDQUOT when project quota exceeded */
>  
>  /*
>   * flags to xfs_trans_mod_dquot to indicate which field needs to be
> diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
> index 0c7fa54..27acce3 100644
> --- a/fs/xfs/xfs_trans_dquot.c
> +++ b/fs/xfs/xfs_trans_dquot.c
> @@ -728,8 +728,6 @@ xfs_trans_dqresv(
>  
>  error_return:
>   xfs_dqunlock(dqp);
> - if (flags & XFS_QMOPT_ENOSPC)
> - return ENOSPC;
>   return EDQUOT;
>  }
>  
> @@ -741,7 +739,6 @@ error_return:
>   * approach.
>   *
>   * flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
> - *   XFS_QMOPT_ENOSPC returns ENOSPC not EDQUOT.  Used by pquota.
>   *   XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks
>   *   XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks
>   * dquots are unlocked on return, if they were not locked by caller.
> @@ -767,8 +764,7 @@ xfs_trans_reserve_quota_bydquots(
>   ASSERT(flags & XFS_QMOPT_RESBLK_MASK);
>  
>   if (udqp) {
> - error = xfs_trans_dqresv(tp, mp, udqp, nblks, ninos,
> - (flags & ~XFS_QMOPT_ENOSPC));
> + error = xfs_trans_dqresv(tp, mp, udqp, nblks, ninos, flags);
>   if (error)
>   return error;
>   resvd = 1;
> @@ -785,6 +781,12 @@ xfs_trans_reserve_quota_bydquots(
>   xfs_trans_dqresv(tp, mp, udqp,
>   -nblks, -ninos, flags);
>   }
> + if (error == EDQUOT && XFS_IS_PQUOTA_ON(mp)) {
> + if (flags & XFS_QMOPT_EPDQUOT)
> + error = EPDQUOT;
> + else
> + error = ENOSPC;
> + }
>   return error;
>   }
>   }
> @@ -813,16 +815,14 @@ xfs_trans_reserve_quota_nblks(
>  
>   if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
>   return 0;
> - if (XFS_IS_PQUOTA_ON(mp))
> - flags |= XFS_QMOPT_ENOSPC;
>  
>   ASSERT(ip->i_ino != mp->m_sb.sb_uquotino);
>   ASSERT(ip->i_ino != mp->m_sb.sb_gquotino);
>  
>   ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
> - ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
> + ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_EPDQUOT)) ==
>   XFS_TRANS_DQ_RES_RTBLKS ||
> -       (flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
> +       (flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_EPDQUOT)) ==
>   XFS_TRANS_DQ_RES_BLKS);
>  
>   /*
> --
> 1.7.1
>
--
Jan Kara <[hidden email]>
SUSE Labs, CR

_______________________________________________
xfs mailing list
[hidden email]
http://oss.sgi.com/mailman/listinfo/xfs
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH] xfs: Don't flush inodes when project quota exceeded

Dave Chinner
On Mon, Nov 19, 2012 at 10:39:13PM +0100, Jan Kara wrote:

> On Tue 13-11-12 01:36:13, Jan Kara wrote:
> > When project quota gets exceeded xfs_iomap_write_delay() ends up flushing
> > inodes because ENOSPC gets returned from xfs_bmapi_delay() instead of EDQUOT.
> > This makes handling of writes over project quota rather slow as a simple test
> > program shows:
> > fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC, 0644);
> > for (i = 0; i < 50000; i++)
> > pwrite(fd, buf, 4096, i*4096);
> >
> > Writing 200 MB like this into a directory with 100 MB project quota takes
> > around 6 minutes while it takes about 2 seconds with this patch applied. This
> > actually happens in a real world load when nfs pushes data into a directory
> > which is over project quota.
> >
> > Fix the problem by replacing XFS_QMOPT_ENOSPC flag with XFS_QMOPT_EPDQUOT.
> > That makes xfs_trans_reserve_quota_bydquots() return new error EPDQUOT when
> > project quota is exceeded. xfs_bmapi_delay() then uses this flag so that
> > xfs_iomap_write_delay() can distinguish real ENOSPC (requiring flushing)
> > from exceeded project quota (not requiring flushing).
> >
> > As a side effect this patch fixes inconsistency where e.g. xfs_create()
> > returned EDQUOT even when project quota was exceeded.
>   Ping? Any opinions?

Busy, haven't had a chance to look at it.

Cheers,

Dave.
--
Dave Chinner
[hidden email]

_______________________________________________
xfs mailing list
[hidden email]
http://oss.sgi.com/mailman/listinfo/xfs
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH] xfs: Don't flush inodes when project quota exceeded

Dave Chinner
In reply to this post by Jan Kara
On Mon, Nov 19, 2012 at 10:39:13PM +0100, Jan Kara wrote:

> On Tue 13-11-12 01:36:13, Jan Kara wrote:
> > When project quota gets exceeded xfs_iomap_write_delay() ends up flushing
> > inodes because ENOSPC gets returned from xfs_bmapi_delay() instead of EDQUOT.
> > This makes handling of writes over project quota rather slow as a simple test
> > program shows:
> > fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC, 0644);
> > for (i = 0; i < 50000; i++)
> > pwrite(fd, buf, 4096, i*4096);
> >
> > Writing 200 MB like this into a directory with 100 MB project quota takes
> > around 6 minutes while it takes about 2 seconds with this patch applied. This
> > actually happens in a real world load when nfs pushes data into a directory
> > which is over project quota.
> >
> > Fix the problem by replacing XFS_QMOPT_ENOSPC flag with XFS_QMOPT_EPDQUOT.
> > That makes xfs_trans_reserve_quota_bydquots() return new error EPDQUOT when
> > project quota is exceeded. xfs_bmapi_delay() then uses this flag so that
> > xfs_iomap_write_delay() can distinguish real ENOSPC (requiring flushing)
> > from exceeded project quota (not requiring flushing).
> >
> > As a side effect this patch fixes inconsistency where e.g. xfs_create()
> > returned EDQUOT even when project quota was exceeded.
>   Ping? Any opinions?

FWIW, it doesn't look like it'll apply to a current XFs tree:

> > @@ -441,8 +442,11 @@ retry:
> >   */
> >   if (nimaps == 0) {
> >   trace_xfs_delalloc_enospc(ip, offset, count);
> > - if (flushed)
> > - return XFS_ERROR(error ? error : ENOSPC);
> > + if (flushed) {
> > + if (error == 0 || error == EPDQUOT)
> > + error = ENOSPC;
> > + return XFS_ERROR(error);
> > + }
> >  
> >   if (error == ENOSPC) {
> >   xfs_iunlock(ip, XFS_ILOCK_EXCL);

This xfs_iomap_write_delay() looks like this now:

        /*
         * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry
         * without EOF preallocation.
         */
        if (nimaps == 0) {
                trace_xfs_delalloc_enospc(ip, offset, count);
                if (prealloc) {
                        prealloc = 0;
                        error = 0;
                        goto retry;
                }
                return XFS_ERROR(error ? error : ENOSPC);
        }

The flushing is now way up in xfs_file_buffered_aio_write(), and the
implementation of xfs_flush_inodes() has changed as well. Hence it
may or may not behave differently not....

Cheers,

Dave.
--
Dave Chinner
[hidden email]

_______________________________________________
xfs mailing list
[hidden email]
http://oss.sgi.com/mailman/listinfo/xfs
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH] xfs: Don't flush inodes when project quota exceeded

Ben Myers
In reply to this post by Jan Kara
Hi Jan,

On Mon, Nov 19, 2012 at 10:39:13PM +0100, Jan Kara wrote:

> On Tue 13-11-12 01:36:13, Jan Kara wrote:
> > When project quota gets exceeded xfs_iomap_write_delay() ends up flushing
> > inodes because ENOSPC gets returned from xfs_bmapi_delay() instead of EDQUOT.
> > This makes handling of writes over project quota rather slow as a simple test
> > program shows:
> > fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC, 0644);
> > for (i = 0; i < 50000; i++)
> > pwrite(fd, buf, 4096, i*4096);
> >
> > Writing 200 MB like this into a directory with 100 MB project quota takes
> > around 6 minutes while it takes about 2 seconds with this patch applied. This
> > actually happens in a real world load when nfs pushes data into a directory
> > which is over project quota.
> >
> > Fix the problem by replacing XFS_QMOPT_ENOSPC flag with XFS_QMOPT_EPDQUOT.
> > That makes xfs_trans_reserve_quota_bydquots() return new error EPDQUOT when
> > project quota is exceeded. xfs_bmapi_delay() then uses this flag so that
> > xfs_iomap_write_delay() can distinguish real ENOSPC (requiring flushing)
> > from exceeded project quota (not requiring flushing).
> >
> > As a side effect this patch fixes inconsistency where e.g. xfs_create()
> > returned EDQUOT even when project quota was exceeded.
>   Ping? Any opinions?

I think that there may be good reason to flush inodes even in the project quota
case.  Speculative allocation beyond EOF might need to be cleaned up.  I'm all
for passing back some data about why we hit ENOSPC.  Then we can combine this
with Brian Foster's work and flush only inodes that touch a given project,
user, or group quota.

-Ben


>
> Honza
> >
> > Signed-off-by: Jan Kara <[hidden email]>
> > ---
> >  fs/xfs/xfs_bmap.c        |    3 ++-
> >  fs/xfs/xfs_iomap.c       |    8 ++++++--
> >  fs/xfs/xfs_linux.h       |    1 +
> >  fs/xfs/xfs_qm.c          |   13 +++++--------
> >  fs/xfs/xfs_quota.h       |    2 +-
> >  fs/xfs/xfs_trans_dquot.c |   18 +++++++++---------
> >  6 files changed, 24 insertions(+), 21 deletions(-)
> >
> > diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
> > index 848ffa7..027398f 100644
> > --- a/fs/xfs/xfs_bmap.c
> > +++ b/fs/xfs/xfs_bmap.c
> > @@ -4471,7 +4471,8 @@ xfs_bmapi_reserve_delalloc(
> >   * allocated blocks already inside this loop.
> >   */
> >   error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0,
> > - rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
> > + XFS_QMOPT_EPDQUOT |
> > + (rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS));
> >   if (error)
> >   return error;
> >  
> > diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> > index 973dff6..86e8016 100644
> > --- a/fs/xfs/xfs_iomap.c
> > +++ b/fs/xfs/xfs_iomap.c
> > @@ -428,6 +428,7 @@ retry:
> >   case 0:
> >   case ENOSPC:
> >   case EDQUOT:
> > + case EPDQUOT:
> >   break;
> >   default:
> >   return XFS_ERROR(error);
> > @@ -441,8 +442,11 @@ retry:
> >   */
> >   if (nimaps == 0) {
> >   trace_xfs_delalloc_enospc(ip, offset, count);
> > - if (flushed)
> > - return XFS_ERROR(error ? error : ENOSPC);
> > + if (flushed) {
> > + if (error == 0 || error == EPDQUOT)
> > + error = ENOSPC;
> > + return XFS_ERROR(error);
> > + }
> >  
> >   if (error == ENOSPC) {
> >   xfs_iunlock(ip, XFS_ILOCK_EXCL);
> > diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
> > index 828662f..31368df 100644
> > --- a/fs/xfs/xfs_linux.h
> > +++ b/fs/xfs/xfs_linux.h
> > @@ -145,6 +145,7 @@
> >  #define ENOATTR ENODATA /* Attribute not found */
> >  #define EWRONGFS EINVAL /* Mount with wrong filesystem type */
> >  #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
> > +#define EPDQUOT EXFULL /* Project quota exceeded */
> >  
> >  #define SYNCHRONIZE() barrier()
> >  #define __return_address __builtin_return_address(0)
> > diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
> > index 2e86fa0..99ace03 100644
> > --- a/fs/xfs/xfs_qm.c
> > +++ b/fs/xfs/xfs_qm.c
> > @@ -1790,7 +1790,7 @@ xfs_qm_vop_chown_reserve(
> >   uint flags)
> >  {
> >   xfs_mount_t *mp = ip->i_mount;
> > - uint delblks, blkflags, prjflags = 0;
> > + uint delblks, blkflags;
> >   xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq;
> >   int error;
> >  
> > @@ -1817,11 +1817,8 @@ xfs_qm_vop_chown_reserve(
> >   }
> >   }
> >   if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) {
> > - if (XFS_IS_PQUOTA_ON(ip->i_mount) &&
> > -     xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id))
> > - prjflags = XFS_QMOPT_ENOSPC;
> > -
> > - if (prjflags ||
> > + if ((XFS_IS_PQUOTA_ON(ip->i_mount) &&
> > +     xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id)) ||
> >      (XFS_IS_GQUOTA_ON(ip->i_mount) &&
> >       ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id))) {
> >   delblksgdq = gdqp;
> > @@ -1834,7 +1831,7 @@ xfs_qm_vop_chown_reserve(
> >  
> >   if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
> >   delblksudq, delblksgdq, ip->i_d.di_nblocks, 1,
> > - flags | blkflags | prjflags)))
> > + flags | blkflags)))
> >   return (error);
> >  
> >   /*
> > @@ -1851,7 +1848,7 @@ xfs_qm_vop_chown_reserve(
> >   ASSERT(unresudq || unresgdq);
> >   if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
> >   delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0,
> > - flags | blkflags | prjflags)))
> > + flags | blkflags)))
> >   return (error);
> >   xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
> >   unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0,
> > diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
> > index b50ec5b..264b455 100644
> > --- a/fs/xfs/xfs_quota.h
> > +++ b/fs/xfs/xfs_quota.h
> > @@ -203,7 +203,7 @@ typedef struct xfs_qoff_logformat {
> >  #define XFS_QMOPT_DOWARN        0x0000400 /* increase warning cnt if needed */
> >  #define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */
> >  #define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
> > -#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */
> > +#define XFS_QMOPT_EPDQUOT 0x0004000 /* return EPDQUOT when project quota exceeded */
> >  
> >  /*
> >   * flags to xfs_trans_mod_dquot to indicate which field needs to be
> > diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
> > index 0c7fa54..27acce3 100644
> > --- a/fs/xfs/xfs_trans_dquot.c
> > +++ b/fs/xfs/xfs_trans_dquot.c
> > @@ -728,8 +728,6 @@ xfs_trans_dqresv(
> >  
> >  error_return:
> >   xfs_dqunlock(dqp);
> > - if (flags & XFS_QMOPT_ENOSPC)
> > - return ENOSPC;
> >   return EDQUOT;
> >  }
> >  
> > @@ -741,7 +739,6 @@ error_return:
> >   * approach.
> >   *
> >   * flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
> > - *   XFS_QMOPT_ENOSPC returns ENOSPC not EDQUOT.  Used by pquota.
> >   *   XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks
> >   *   XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks
> >   * dquots are unlocked on return, if they were not locked by caller.
> > @@ -767,8 +764,7 @@ xfs_trans_reserve_quota_bydquots(
> >   ASSERT(flags & XFS_QMOPT_RESBLK_MASK);
> >  
> >   if (udqp) {
> > - error = xfs_trans_dqresv(tp, mp, udqp, nblks, ninos,
> > - (flags & ~XFS_QMOPT_ENOSPC));
> > + error = xfs_trans_dqresv(tp, mp, udqp, nblks, ninos, flags);
> >   if (error)
> >   return error;
> >   resvd = 1;
> > @@ -785,6 +781,12 @@ xfs_trans_reserve_quota_bydquots(
> >   xfs_trans_dqresv(tp, mp, udqp,
> >   -nblks, -ninos, flags);
> >   }
> > + if (error == EDQUOT && XFS_IS_PQUOTA_ON(mp)) {
> > + if (flags & XFS_QMOPT_EPDQUOT)
> > + error = EPDQUOT;
> > + else
> > + error = ENOSPC;
> > + }
> >   return error;
> >   }
> >   }
> > @@ -813,16 +815,14 @@ xfs_trans_reserve_quota_nblks(
> >  
> >   if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
> >   return 0;
> > - if (XFS_IS_PQUOTA_ON(mp))
> > - flags |= XFS_QMOPT_ENOSPC;
> >  
> >   ASSERT(ip->i_ino != mp->m_sb.sb_uquotino);
> >   ASSERT(ip->i_ino != mp->m_sb.sb_gquotino);
> >  
> >   ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
> > - ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
> > + ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_EPDQUOT)) ==
> >   XFS_TRANS_DQ_RES_RTBLKS ||
> > -       (flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
> > +       (flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_EPDQUOT)) ==
> >   XFS_TRANS_DQ_RES_BLKS);
> >  
> >   /*
> > --
> > 1.7.1
> >
> --
> Jan Kara <[hidden email]>
> SUSE Labs, CR
>
> _______________________________________________
> xfs mailing list
> [hidden email]
> http://oss.sgi.com/mailman/listinfo/xfs

_______________________________________________
xfs mailing list
[hidden email]
http://oss.sgi.com/mailman/listinfo/xfs
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH] xfs: Don't flush inodes when project quota exceeded

Jan Kara
On Tue 20-11-12 10:15:11, Ben Myers wrote:

> Hi Jan,
>
> On Mon, Nov 19, 2012 at 10:39:13PM +0100, Jan Kara wrote:
> > On Tue 13-11-12 01:36:13, Jan Kara wrote:
> > > When project quota gets exceeded xfs_iomap_write_delay() ends up flushing
> > > inodes because ENOSPC gets returned from xfs_bmapi_delay() instead of EDQUOT.
> > > This makes handling of writes over project quota rather slow as a simple test
> > > program shows:
> > > fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC, 0644);
> > > for (i = 0; i < 50000; i++)
> > > pwrite(fd, buf, 4096, i*4096);
> > >
> > > Writing 200 MB like this into a directory with 100 MB project quota takes
> > > around 6 minutes while it takes about 2 seconds with this patch applied. This
> > > actually happens in a real world load when nfs pushes data into a directory
> > > which is over project quota.
> > >
> > > Fix the problem by replacing XFS_QMOPT_ENOSPC flag with XFS_QMOPT_EPDQUOT.
> > > That makes xfs_trans_reserve_quota_bydquots() return new error EPDQUOT when
> > > project quota is exceeded. xfs_bmapi_delay() then uses this flag so that
> > > xfs_iomap_write_delay() can distinguish real ENOSPC (requiring flushing)
> > > from exceeded project quota (not requiring flushing).
> > >
> > > As a side effect this patch fixes inconsistency where e.g. xfs_create()
> > > returned EDQUOT even when project quota was exceeded.
> >   Ping? Any opinions?
>
> I think that there may be good reason to flush inodes even in the project quota
> case.  Speculative allocation beyond EOF might need to be cleaned up.  I'm all
> for passing back some data about why we hit ENOSPC.  Then we can combine this
> with Brian Foster's work and flush only inodes that touch a given project,
> user, or group quota.
  Yes, I agree flushing might be useful even for project quota but then why
don't we flush inodes also for user quota? Also the performance impact
is really huge - and here I agree that unless you are writing over NFS you
won't notice because only NFS tries to push X MB to the filesystem page by
page only to get ENOSPC each time... And NFS is arguably doing a stupid
thing but it is a common setup and you don't always have the freedom to fix
clients to be more clever. So I'd be happy if XFS accomodated such use.

                                                                Honza
--
Jan Kara <[hidden email]>
SUSE Labs, CR

_______________________________________________
xfs mailing list
[hidden email]
http://oss.sgi.com/mailman/listinfo/xfs
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH] xfs: Don't flush inodes when project quota exceeded

Dave Chinner
On Tue, Nov 20, 2012 at 06:03:54PM +0100, Jan Kara wrote:

> On Tue 20-11-12 10:15:11, Ben Myers wrote:
> > Hi Jan,
> >
> > On Mon, Nov 19, 2012 at 10:39:13PM +0100, Jan Kara wrote:
> > > On Tue 13-11-12 01:36:13, Jan Kara wrote:
> > > > When project quota gets exceeded xfs_iomap_write_delay() ends up flushing
> > > > inodes because ENOSPC gets returned from xfs_bmapi_delay() instead of EDQUOT.
> > > > This makes handling of writes over project quota rather slow as a simple test
> > > > program shows:
> > > > fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC, 0644);
> > > > for (i = 0; i < 50000; i++)
> > > > pwrite(fd, buf, 4096, i*4096);
> > > >
> > > > Writing 200 MB like this into a directory with 100 MB project quota takes
> > > > around 6 minutes while it takes about 2 seconds with this patch applied. This
> > > > actually happens in a real world load when nfs pushes data into a directory
> > > > which is over project quota.
> > > >
> > > > Fix the problem by replacing XFS_QMOPT_ENOSPC flag with XFS_QMOPT_EPDQUOT.
> > > > That makes xfs_trans_reserve_quota_bydquots() return new error EPDQUOT when
> > > > project quota is exceeded. xfs_bmapi_delay() then uses this flag so that
> > > > xfs_iomap_write_delay() can distinguish real ENOSPC (requiring flushing)
> > > > from exceeded project quota (not requiring flushing).
> > > >
> > > > As a side effect this patch fixes inconsistency where e.g. xfs_create()
> > > > returned EDQUOT even when project quota was exceeded.
> > >   Ping? Any opinions?
> >
> > I think that there may be good reason to flush inodes even in the project quota
> > case.  Speculative allocation beyond EOF might need to be cleaned up.

The flushing at ENOSPC doesn't clear up speculative preallocation.
It writes back data which releases metadata reservations that
delalloc extents hold in addition to the data. The reservations are
held so that tree splits during allocation have block reserved and
we don't ENOSPC on delalloc all the time.

> >
> > I'm all
> > for passing back some data about why we hit ENOSPC.  Then we can combine this
> > with Brian Foster's work and flush only inodes that touch a given project,
> > user, or group quota.

Brian had more patches that throttled specualtive prealloc when
quota got low, as well as triggered a specific specualtive
allocation trimming passes when EDQUOT is hit. This will remove the
global inode flush from the project uota cae when it is done.

>   Yes, I agree flushing might be useful even for project quota but then why
> don't we flush inodes also for user quota?

It's by design. Directory tree quota is used as a method of
exporting multiple sub-dirs from a single filesystem but having them
appear to NFS clients just like a standalone filesystem. Hence when
you run out of projet quota, it is treated like an ENOSPC condition
for the directory sub-tree - it flushes as much of the metadata
reservations out as possible to maximise the data space for the
directory tree.

For user/group quotas, this requirement of behaving like a
standalone filesystem does not exist, and so when you EDQUOT a
user/group there is no need to reclaim metadata reservations to make
more data space available....

> Also the performance impact
> is really huge - and here I agree that unless you are writing over NFS you
> won't notice because only NFS tries to push X MB to the filesystem page by
> page only to get ENOSPC each time...

The problem is the speculative allocation can trigger this behaviour
prematurely and repeatedly. That's where Brians prealloc throttle
patches come in - it reduces the occurrence of ENOSPC as the quota
limit is approached, and hence reduces the number of inode flushes.

> And NFS is arguably doing a stupid
> thing but it is a common setup and you don't always have the freedom to fix
> clients to be more clever. So I'd be happy if XFS accomodated such use.

Sure, but there's more to it than just avoiding the inode flush,
unfortunately...

Cheers,

Dave.
--
Dave Chinner
[hidden email]

_______________________________________________
xfs mailing list
[hidden email]
http://oss.sgi.com/mailman/listinfo/xfs
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH] xfs: Don't flush inodes when project quota exceeded

Brian Foster-7
On 11/20/2012 03:20 PM, Dave Chinner wrote:
> On Tue, Nov 20, 2012 at 06:03:54PM +0100, Jan Kara wrote:
>> On Tue 20-11-12 10:15:11, Ben Myers wrote:
>>> Hi Jan,
>>>
>>> On Mon, Nov 19, 2012 at 10:39:13PM +0100, Jan Kara wrote:
>>>> On Tue 13-11-12 01:36:13, Jan Kara wrote:
...

>>>
>>> I think that there may be good reason to flush inodes even in the project quota
>>> case.  Speculative allocation beyond EOF might need to be cleaned up.
>
> The flushing at ENOSPC doesn't clear up speculative preallocation.
> It writes back data which releases metadata reservations that
> delalloc extents hold in addition to the data. The reservations are
> held so that tree splits during allocation have block reserved and
> we don't ENOSPC on delalloc all the time.
>

So does this apply to EDQUOT just the same as ENOSPC? In other words,
must we always do the flush, or can we replace the flush in EDQUOT
situations with a targeted eofblocks scan and retry?

Are these metadata reservations accounted against the particular quota?

>>>
>>> I'm all
>>> for passing back some data about why we hit ENOSPC.  Then we can combine this
>>> with Brian Foster's work and flush only inodes that touch a given project,
>>> user, or group quota.
>
> Brian had more patches that throttled specualtive prealloc when
> quota got low, as well as triggered a specific specualtive
> allocation trimming passes when EDQUOT is hit. This will remove the
> global inode flush from the project uota cae when it is done.
>

Yes, I need to resurrect those throttling patches as my next order of
business. They never had the eofblocks bits originally but it seems like
a logical add on at this point.

A point of confusion for me... above you reference removing the global
inode flush from the project quota case, which I'm taking as we can do
some kind of eofblocks scan here...

>>   Yes, I agree flushing might be useful even for project quota but then why
>> don't we flush inodes also for user quota?
>
> It's by design. Directory tree quota is used as a method of
> exporting multiple sub-dirs from a single filesystem but having them
> appear to NFS clients just like a standalone filesystem. Hence when
> you run out of projet quota, it is treated like an ENOSPC condition
> for the directory sub-tree - it flushes as much of the metadata
> reservations out as possible to maximise the data space for the
> directory tree.
>
> For user/group quotas, this requirement of behaving like a
> standalone filesystem does not exist, and so when you EDQUOT a
> user/group there is no need to reclaim metadata reservations to make
> more data space available....
>

... but the metadata reservation issue discussed here sounds like it
could still be a problem. Is the implication that we could still do a
project quota filtered eofblocks scan, but it must also (or instead, if
a flush implies trimming post-eof space) include an inode flush in the
project quota case?

Otherwise, unless I'm mistaken it sounds like we can use the existing
eofblocks scan on user/group EDQUOT situations.

>> Also the performance impact
>> is really huge - and here I agree that unless you are writing over NFS you
>> won't notice because only NFS tries to push X MB to the filesystem page by
>> page only to get ENOSPC each time...
>
> The problem is the speculative allocation can trigger this behaviour
> prematurely and repeatedly. That's where Brians prealloc throttle
> patches come in - it reduces the occurrence of ENOSPC as the quota
> limit is approached, and hence reduces the number of inode flushes.
>

I'll include this use case in my testing when I get back to those patches.

Brian

>> And NFS is arguably doing a stupid
>> thing but it is a common setup and you don't always have the freedom to fix
>> clients to be more clever. So I'd be happy if XFS accomodated such use.
>
> Sure, but there's more to it than just avoiding the inode flush,
> unfortunately...
>
> Cheers,
>
> Dave.
>

_______________________________________________
xfs mailing list
[hidden email]
http://oss.sgi.com/mailman/listinfo/xfs
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH] xfs: Don't flush inodes when project quota exceeded

Dave Chinner
On Tue, Nov 20, 2012 at 04:25:34PM -0500, Brian Foster wrote:

> On 11/20/2012 03:20 PM, Dave Chinner wrote:
> > On Tue, Nov 20, 2012 at 06:03:54PM +0100, Jan Kara wrote:
> >> On Tue 20-11-12 10:15:11, Ben Myers wrote:
> >>> Hi Jan,
> >>>
> >>> On Mon, Nov 19, 2012 at 10:39:13PM +0100, Jan Kara wrote:
> >>>> On Tue 13-11-12 01:36:13, Jan Kara wrote:
> ...
> >>>
> >>> I think that there may be good reason to flush inodes even in the project quota
> >>> case.  Speculative allocation beyond EOF might need to be cleaned up.
> >
> > The flushing at ENOSPC doesn't clear up speculative preallocation.
> > It writes back data which releases metadata reservations that
> > delalloc extents hold in addition to the data. The reservations are
> > held so that tree splits during allocation have block reserved and
> > we don't ENOSPC on delalloc all the time.
> >
>
> So does this apply to EDQUOT just the same as ENOSPC?

We don't apply it to edquot because the filesystem still has plenty
of data space available, hence removing the metadata reservations is
not necessary to prevent premature ENOSPC errors. The metadata
reservation is correctly attributed to the quota, so no flushing on
EDQUOT is fine...

> In other words,
> must we always do the flush, or can we replace the flush in EDQUOT
> situations with a targeted eofblocks scan and retry?

There is no flush for EDQUOT right now but we need to add a targeted
eofblocks scan and retry on EDQUOT.

> Are these metadata reservations accounted against the particular quota?

Yes.

> >>> I'm all
> >>> for passing back some data about why we hit ENOSPC.  Then we can combine this
> >>> with Brian Foster's work and flush only inodes that touch a given project,
> >>> user, or group quota.
> >
> > Brian had more patches that throttled specualtive prealloc when
> > quota got low, as well as triggered a specific specualtive
> > allocation trimming passes when EDQUOT is hit. This will remove the
> > global inode flush from the project uota cae when it is done.
>
> Yes, I need to resurrect those throttling patches as my next order of
> business. They never had the eofblocks bits originally but it seems like
> a logical add on at this point.

*nod*

> A point of confusion for me... above you reference removing the global
> inode flush from the project quota case, which I'm taking as we can do
> some kind of eofblocks scan here...

Yes, the only inodes that will have reserved metadata are those with
outstanding delayed allocation, and those are also the ones that
will have outstanding specualtive preallocation....

> >>   Yes, I agree flushing might be useful even for project quota but then why
> >> don't we flush inodes also for user quota?
> >
> > It's by design. Directory tree quota is used as a method of
> > exporting multiple sub-dirs from a single filesystem but having them
> > appear to NFS clients just like a standalone filesystem. Hence when
> > you run out of projet quota, it is treated like an ENOSPC condition
> > for the directory sub-tree - it flushes as much of the metadata
> > reservations out as possible to maximise the data space for the
> > directory tree.

And FWIW, returning ENOSPC to the NFS client inthese situations is
the correct error to be returning as they know nothing about the
fact that project quotas are used on the server to limit the size of
the export the client has mounted.

> > For user/group quotas, this requirement of behaving like a
> > standalone filesystem does not exist, and so when you EDQUOT a
> > user/group there is no need to reclaim metadata reservations to make
> > more data space available....
>
> ... but the metadata reservation issue discussed here sounds like it
> could still be a problem. Is the implication that we could still do a
> project quota filtered eofblocks scan, but it must also (or instead, if
> a flush implies trimming post-eof space) include an inode flush in the
> project quota case?

It's never been reported as a problem. In the absence of problems
being reported over the past 10+ years, I don't think we should be
changing the behaviour of inode flushing at EDQUOT. Trimming
speculative prealloc, yes, but I don't think flushing inodes is
necessary.

> Otherwise, unless I'm mistaken it sounds like we can use the existing
> eofblocks scan on user/group EDQUOT situations.

That we can. And for the project case, it's a simply and extra flag
and a call to filemap_flush() to do an async writeback before
trimming the specualtive preallocation.

> >> Also the performance impact
> >> is really huge - and here I agree that unless you are writing over NFS you
> >> won't notice because only NFS tries to push X MB to the filesystem page by
> >> page only to get ENOSPC each time...
> >
> > The problem is the speculative allocation can trigger this behaviour
> > prematurely and repeatedly. That's where Brians prealloc throttle
> > patches come in - it reduces the occurrence of ENOSPC as the quota
> > limit is approached, and hence reduces the number of inode flushes.
> >
>
> I'll include this use case in my testing when I get back to those patches.

Cool.

Cheers,

Dave.
--
Dave Chinner
[hidden email]

_______________________________________________
xfs mailing list
[hidden email]
http://oss.sgi.com/mailman/listinfo/xfs
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH] xfs: Don't flush inodes when project quota exceeded

Jan Kara
In reply to this post by Dave Chinner
On Tue 20-11-12 11:04:28, Dave Chinner wrote:

> On Mon, Nov 19, 2012 at 10:39:13PM +0100, Jan Kara wrote:
> > On Tue 13-11-12 01:36:13, Jan Kara wrote:
> > > When project quota gets exceeded xfs_iomap_write_delay() ends up flushing
> > > inodes because ENOSPC gets returned from xfs_bmapi_delay() instead of EDQUOT.
> > > This makes handling of writes over project quota rather slow as a simple test
> > > program shows:
> > > fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC, 0644);
> > > for (i = 0; i < 50000; i++)
> > > pwrite(fd, buf, 4096, i*4096);
> > >
> > > Writing 200 MB like this into a directory with 100 MB project quota takes
> > > around 6 minutes while it takes about 2 seconds with this patch applied. This
> > > actually happens in a real world load when nfs pushes data into a directory
> > > which is over project quota.
> > >
> > > Fix the problem by replacing XFS_QMOPT_ENOSPC flag with XFS_QMOPT_EPDQUOT.
> > > That makes xfs_trans_reserve_quota_bydquots() return new error EPDQUOT when
> > > project quota is exceeded. xfs_bmapi_delay() then uses this flag so that
> > > xfs_iomap_write_delay() can distinguish real ENOSPC (requiring flushing)
> > > from exceeded project quota (not requiring flushing).
> > >
> > > As a side effect this patch fixes inconsistency where e.g. xfs_create()
> > > returned EDQUOT even when project quota was exceeded.
> >   Ping? Any opinions?
>
> FWIW, it doesn't look like it'll apply to a current XFs tree:
>
> > > @@ -441,8 +442,11 @@ retry:
> > >   */
> > >   if (nimaps == 0) {
> > >   trace_xfs_delalloc_enospc(ip, offset, count);
> > > - if (flushed)
> > > - return XFS_ERROR(error ? error : ENOSPC);
> > > + if (flushed) {
> > > + if (error == 0 || error == EPDQUOT)
> > > + error = ENOSPC;
> > > + return XFS_ERROR(error);
> > > + }
> > >  
> > >   if (error == ENOSPC) {
> > >   xfs_iunlock(ip, XFS_ILOCK_EXCL);
>
> This xfs_iomap_write_delay() looks like this now:
>
>         /*
>          * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry
>          * without EOF preallocation.
>          */
>         if (nimaps == 0) {
>                 trace_xfs_delalloc_enospc(ip, offset, count);
>                 if (prealloc) {
>                         prealloc = 0;
>                         error = 0;
>                         goto retry;
>                 }
>                 return XFS_ERROR(error ? error : ENOSPC);
>         }
>
> The flushing is now way up in xfs_file_buffered_aio_write(), and the
> implementation of xfs_flush_inodes() has changed as well. Hence it
> may or may not behave differently not....
  OK, so I tested latest XFS tree and changes by commit 9aa05000 (changing
xfs_flush_inodes()) indeed improve the performace from those ~6 minutes to
~6 seconds which is good enough I believe. Thanks for the pointer! I was
thinking for a while why sync_inodes_sb() is so much faster than the
original XFS implementation and I believe it's because we don't force the
log on each sync now.

                                                                Honza
--
Jan Kara <[hidden email]>
SUSE Labs, CR

_______________________________________________
xfs mailing list
[hidden email]
http://oss.sgi.com/mailman/listinfo/xfs
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH] xfs: Don't flush inodes when project quota exceeded

Dave Chinner
On Wed, Nov 21, 2012 at 01:24:59AM +0100, Jan Kara wrote:

> On Tue 20-11-12 11:04:28, Dave Chinner wrote:
> > On Mon, Nov 19, 2012 at 10:39:13PM +0100, Jan Kara wrote:
> > > On Tue 13-11-12 01:36:13, Jan Kara wrote:
> > > > When project quota gets exceeded xfs_iomap_write_delay() ends up flushing
> > > > inodes because ENOSPC gets returned from xfs_bmapi_delay() instead of EDQUOT.
> > > > This makes handling of writes over project quota rather slow as a simple test
> > > > program shows:
> > > > fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC, 0644);
> > > > for (i = 0; i < 50000; i++)
> > > > pwrite(fd, buf, 4096, i*4096);
> > > >
> > > > Writing 200 MB like this into a directory with 100 MB project quota takes
> > > > around 6 minutes while it takes about 2 seconds with this patch applied. This
> > > > actually happens in a real world load when nfs pushes data into a directory
> > > > which is over project quota.
> > > >
> > > > Fix the problem by replacing XFS_QMOPT_ENOSPC flag with XFS_QMOPT_EPDQUOT.
> > > > That makes xfs_trans_reserve_quota_bydquots() return new error EPDQUOT when
> > > > project quota is exceeded. xfs_bmapi_delay() then uses this flag so that
> > > > xfs_iomap_write_delay() can distinguish real ENOSPC (requiring flushing)
> > > > from exceeded project quota (not requiring flushing).
> > > >
> > > > As a side effect this patch fixes inconsistency where e.g. xfs_create()
> > > > returned EDQUOT even when project quota was exceeded.
> > >   Ping? Any opinions?
> >
> > FWIW, it doesn't look like it'll apply to a current XFs tree:
> >
> > > > @@ -441,8 +442,11 @@ retry:
> > > >   */
> > > >   if (nimaps == 0) {
> > > >   trace_xfs_delalloc_enospc(ip, offset, count);
> > > > - if (flushed)
> > > > - return XFS_ERROR(error ? error : ENOSPC);
> > > > + if (flushed) {
> > > > + if (error == 0 || error == EPDQUOT)
> > > > + error = ENOSPC;
> > > > + return XFS_ERROR(error);
> > > > + }
> > > >  
> > > >   if (error == ENOSPC) {
> > > >   xfs_iunlock(ip, XFS_ILOCK_EXCL);
> >
> > This xfs_iomap_write_delay() looks like this now:
> >
> >         /*
> >          * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry
> >          * without EOF preallocation.
> >          */
> >         if (nimaps == 0) {
> >                 trace_xfs_delalloc_enospc(ip, offset, count);
> >                 if (prealloc) {
> >                         prealloc = 0;
> >                         error = 0;
> >                         goto retry;
> >                 }
> >                 return XFS_ERROR(error ? error : ENOSPC);
> >         }
> >
> > The flushing is now way up in xfs_file_buffered_aio_write(), and the
> > implementation of xfs_flush_inodes() has changed as well. Hence it
> > may or may not behave differently not....
>   OK, so I tested latest XFS tree and changes by commit 9aa05000 (changing
> xfs_flush_inodes()) indeed improve the performace from those ~6 minutes to
> ~6 seconds which is good enough I believe. Thanks for the pointer! I was
> thinking for a while why sync_inodes_sb() is so much faster than the
> original XFS implementation and I believe it's because we don't force the
> log on each sync now.

I think it's detter because we now don't scan every inode in the
inode cache doing a mapping_tagged(PAGECACHE_TAG_DIRTY) check to see
if they are dirty or not to decide whether it needs writeback. The
overhead of doing that adds up very quickly when you have lots of
cached inodes and you are scanning them for every page that is
written to....

Cheers,

Dave.
--
Dave Chinner
[hidden email]

_______________________________________________
xfs mailing list
[hidden email]
http://oss.sgi.com/mailman/listinfo/xfs
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH] xfs: Don't flush inodes when project quota exceeded

Jan Kara
On Wed 21-11-12 12:38:21, Dave Chinner wrote:

> On Wed, Nov 21, 2012 at 01:24:59AM +0100, Jan Kara wrote:
> > On Tue 20-11-12 11:04:28, Dave Chinner wrote:
> > > On Mon, Nov 19, 2012 at 10:39:13PM +0100, Jan Kara wrote:
> > > > On Tue 13-11-12 01:36:13, Jan Kara wrote:
> > > > > When project quota gets exceeded xfs_iomap_write_delay() ends up flushing
> > > > > inodes because ENOSPC gets returned from xfs_bmapi_delay() instead of EDQUOT.
> > > > > This makes handling of writes over project quota rather slow as a simple test
> > > > > program shows:
> > > > > fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC, 0644);
> > > > > for (i = 0; i < 50000; i++)
> > > > > pwrite(fd, buf, 4096, i*4096);
> > > > >
> > > > > Writing 200 MB like this into a directory with 100 MB project quota takes
> > > > > around 6 minutes while it takes about 2 seconds with this patch applied. This
> > > > > actually happens in a real world load when nfs pushes data into a directory
> > > > > which is over project quota.
> > > > >
> > > > > Fix the problem by replacing XFS_QMOPT_ENOSPC flag with XFS_QMOPT_EPDQUOT.
> > > > > That makes xfs_trans_reserve_quota_bydquots() return new error EPDQUOT when
> > > > > project quota is exceeded. xfs_bmapi_delay() then uses this flag so that
> > > > > xfs_iomap_write_delay() can distinguish real ENOSPC (requiring flushing)
> > > > > from exceeded project quota (not requiring flushing).
> > > > >
> > > > > As a side effect this patch fixes inconsistency where e.g. xfs_create()
> > > > > returned EDQUOT even when project quota was exceeded.
> > > >   Ping? Any opinions?
> > >
> > > FWIW, it doesn't look like it'll apply to a current XFs tree:
> > >
> > > > > @@ -441,8 +442,11 @@ retry:
> > > > >   */
> > > > >   if (nimaps == 0) {
> > > > >   trace_xfs_delalloc_enospc(ip, offset, count);
> > > > > - if (flushed)
> > > > > - return XFS_ERROR(error ? error : ENOSPC);
> > > > > + if (flushed) {
> > > > > + if (error == 0 || error == EPDQUOT)
> > > > > + error = ENOSPC;
> > > > > + return XFS_ERROR(error);
> > > > > + }
> > > > >  
> > > > >   if (error == ENOSPC) {
> > > > >   xfs_iunlock(ip, XFS_ILOCK_EXCL);
> > >
> > > This xfs_iomap_write_delay() looks like this now:
> > >
> > >         /*
> > >          * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry
> > >          * without EOF preallocation.
> > >          */
> > >         if (nimaps == 0) {
> > >                 trace_xfs_delalloc_enospc(ip, offset, count);
> > >                 if (prealloc) {
> > >                         prealloc = 0;
> > >                         error = 0;
> > >                         goto retry;
> > >                 }
> > >                 return XFS_ERROR(error ? error : ENOSPC);
> > >         }
> > >
> > > The flushing is now way up in xfs_file_buffered_aio_write(), and the
> > > implementation of xfs_flush_inodes() has changed as well. Hence it
> > > may or may not behave differently not....
> >   OK, so I tested latest XFS tree and changes by commit 9aa05000 (changing
> > xfs_flush_inodes()) indeed improve the performace from those ~6 minutes to
> > ~6 seconds which is good enough I believe. Thanks for the pointer! I was
> > thinking for a while why sync_inodes_sb() is so much faster than the
> > original XFS implementation and I believe it's because we don't force the
> > log on each sync now.
>
> I think it's detter because we now don't scan every inode in the
> inode cache doing a mapping_tagged(PAGECACHE_TAG_DIRTY) check to see
> if they are dirty or not to decide whether it needs writeback. The
> overhead of doing that adds up very quickly when you have lots of
> cached inodes and you are scanning them for every page that is
> written to....
  That may be for other workloads but in my case I had a test case where
I'm pretty sure only a couple of inodes were in the cache (I just mounted a
filesystem and beated one file in one directory on it).

                                                                Honza
--
Jan Kara <[hidden email]>
SUSE Labs, CR

_______________________________________________
xfs mailing list
[hidden email]
http://oss.sgi.com/mailman/listinfo/xfs
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH] xfs: Don't flush inodes when project quota exceeded

Brian Foster-7
In reply to this post by Dave Chinner
On 11/20/2012 05:22 PM, Dave Chinner wrote:
> On Tue, Nov 20, 2012 at 04:25:34PM -0500, Brian Foster wrote:
>> On 11/20/2012 03:20 PM, Dave Chinner wrote:
...

>>> It's by design. Directory tree quota is used as a method of
>>> exporting multiple sub-dirs from a single filesystem but having them
>>> appear to NFS clients just like a standalone filesystem. Hence when
>>> you run out of projet quota, it is treated like an ENOSPC condition
>>> for the directory sub-tree - it flushes as much of the metadata
>>> reservations out as possible to maximise the data space for the
>>> directory tree.
>
> And FWIW, returning ENOSPC to the NFS client inthese situations is
> the correct error to be returning as they know nothing about the
> fact that project quotas are used on the server to limit the size of
> the export the client has mounted.
>

Ok, I initially missed that we returned ENOSPC in the project quota
situation, so I might have mixed up what I was referring to as EDQUOT
vs. ENOSPC. I see the XFS_QMOPT_ENOSPC logic that appears to cause that
behavior.

>>> For user/group quotas, this requirement of behaving like a
>>> standalone filesystem does not exist, and so when you EDQUOT a
>>> user/group there is no need to reclaim metadata reservations to make
>>> more data space available....
>>
>> ... but the metadata reservation issue discussed here sounds like it
>> could still be a problem. Is the implication that we could still do a
>> project quota filtered eofblocks scan, but it must also (or instead, if
>> a flush implies trimming post-eof space) include an inode flush in the
>> project quota case?
>
> It's never been reported as a problem. In the absence of problems
> being reported over the past 10+ years, I don't think we should be
> changing the behaviour of inode flushing at EDQUOT. Trimming
> speculative prealloc, yes, but I don't think flushing inodes is
> necessary.
>

I was unclear. By problem, I just meant that replacing the inode flush
after a project quota ENOSPC with an eofblocks scan would be a problem
(e.g., a flush + eofblocks trim is the right approach, as you outline
below).

>> Otherwise, unless I'm mistaken it sounds like we can use the existing
>> eofblocks scan on user/group EDQUOT situations.
>
> That we can. And for the project case, it's a simply and extra flag
> and a call to filemap_flush() to do an async writeback before
> trimming the specualtive preallocation.
>

This makes sense generally. I'll need to dig further into this code to
wrap my head around it. If my current understanding is correct, it
sounds like we could do the same thing in general, non-quota ENOSPC
handling, right? E.g., an unfiltered eofblocks scan (plus the flush bit)
followed by a retry, rather than a complete flush of all dirty inodes.

Brian

...
>
> Cheers,
>
> Dave.
>

_______________________________________________
xfs mailing list
[hidden email]
http://oss.sgi.com/mailman/listinfo/xfs
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH] xfs: Don't flush inodes when project quota exceeded

Dave Chinner
On Wed, Nov 21, 2012 at 10:26:01AM -0500, Brian Foster wrote:

> On 11/20/2012 05:22 PM, Dave Chinner wrote:
> > On Tue, Nov 20, 2012 at 04:25:34PM -0500, Brian Foster wrote:
> >> Otherwise, unless I'm mistaken it sounds like we can use the existing
> >> eofblocks scan on user/group EDQUOT situations.
> >
> > That we can. And for the project case, it's a simply and extra flag
> > and a call to filemap_flush() to do an async writeback before
> > trimming the specualtive preallocation.
>
> This makes sense generally. I'll need to dig further into this code to
> wrap my head around it. If my current understanding is correct, it
> sounds like we could do the same thing in general, non-quota ENOSPC
> handling, right? E.g., an unfiltered eofblocks scan (plus the flush bit)
> followed by a retry, rather than a complete flush of all dirty inodes.

Possibly, though we can also have dirty inodes that don't have
speculative prealloc (e.g. hole filling writes) and we really need to
flush them at ENOSPC, too. In general, I think the real ENOSPC case
needs the big hammer because we have to reclaim every last piece of
unnecessary reservation as possible to allow anything to proceed.
Porject quota ENOSPC isn't such a problem here, as there's still
other free space in the filesystem that allow normal operation to
take place...

Cheers,

Dave.
--
Dave Chinner
[hidden email]

_______________________________________________
xfs mailing list
[hidden email]
http://oss.sgi.com/mailman/listinfo/xfs