diff --git a/opal/mca/btl/vader/btl_vader_module.c b/opal/mca/btl/vader/btl_vader_module.c index 1464d007b4..0b93f1e130 100644 --- a/opal/mca/btl/vader/btl_vader_module.c +++ b/opal/mca/btl/vader/btl_vader_module.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2006-2007 Voltaire. All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2010-2016 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. * Copyright (c) 2014-2015 Research Organization for Information Science @@ -528,6 +528,17 @@ static void mca_btl_vader_endpoint_constructor (mca_btl_vader_endpoint_t *ep) ep->fifo = NULL; } +#if OPAL_BTL_VADER_HAVE_XPMEM +static int mca_btl_vader_endpoint_rcache_cleanup (mca_mpool_base_registration_t *reg, void *ctx) +{ + struct mca_rcache_base_module_t *rcache = (struct mca_rcache_base_module_t *) ctx; + /* otherwise dereg will fail on assert */ + reg->ref_count = 0; + (void) rcache->rcache_delete (rcache, reg); + return OPAL_SUCCESS; +} +#endif + static void mca_btl_vader_endpoint_destructor (mca_btl_vader_endpoint_t *ep) { OBJ_DESTRUCT(&ep->pending_frags); @@ -537,21 +548,10 @@ static void mca_btl_vader_endpoint_destructor (mca_btl_vader_endpoint_t *ep) if (MCA_BTL_VADER_XPMEM == mca_btl_vader_component.single_copy_mechanism) { if (ep->segment_data.xpmem.rcache) { /* clean out the registration cache */ - const int nregs = 100; - mca_mpool_base_registration_t *regs[nregs]; - int reg_cnt; - - do { - reg_cnt = ep->segment_data.xpmem.rcache->rcache_find_all(ep->segment_data.xpmem.rcache, 0, (size_t)-1, - regs, nregs); - - for (int i = 0 ; i < reg_cnt ; ++i) { - /* otherwise dereg will fail on assert */ - regs[i]->ref_count = 0; - OBJ_RELEASE(regs[i]); - } - } while (reg_cnt == nregs); - + (void) ep->segment_data.xpmem.rcache->rcache_iterate (ep->segment_data.xpmem.rcache, + NULL, 
(size_t) -1, + mca_btl_vader_endpoint_rcache_cleanup, + (void *) ep->segment_data.xpmem.rcache); ep->segment_data.xpmem.rcache = NULL; } diff --git a/opal/mca/mpool/grdma/mpool_grdma.h b/opal/mca/mpool/grdma/mpool_grdma.h index 7f9c7cbefb..f96b62af4c 100644 --- a/opal/mca/mpool/grdma/mpool_grdma.h +++ b/opal/mca/mpool/grdma/mpool_grdma.h @@ -11,7 +11,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006 Voltaire. All rights reserved. - * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2016 Los Alamos National Security, LLC. All rights * reserved. * * $COPYRIGHT$ @@ -28,6 +28,7 @@ #include "opal_config.h" #include "opal/class/opal_list.h" +#include "opal/class/opal_lifo.h" #include "opal/mca/event/event.h" #include "opal/mca/mpool/mpool.h" #if HAVE_SYS_MMAN_H @@ -42,7 +43,7 @@ struct mca_mpool_grdma_pool_t { opal_list_item_t super; char *pool_name; opal_list_t lru_list; - opal_list_t gc_list; + opal_lifo_t gc_lifo; struct mca_rcache_base_module_t *rcache; }; typedef struct mca_mpool_grdma_pool_t mca_mpool_grdma_pool_t; diff --git a/opal/mca/mpool/grdma/mpool_grdma_module.c b/opal/mca/mpool/grdma/mpool_grdma_module.c index 5e997ece60..d99dc3440e 100644 --- a/opal/mca/mpool/grdma/mpool_grdma_module.c +++ b/opal/mca/mpool/grdma/mpool_grdma_module.c @@ -14,7 +14,7 @@ * Copyright (c) 2006 Voltaire. All rights reserved. * Copyright (c) 2007 Mellanox Technologies. All rights reserved. * Copyright (c) 2010 IBM Corporation. All rights reserved. - * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2016 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2013 NVIDIA Corporation. All rights reserved. 
* @@ -61,7 +61,7 @@ static void mca_mpool_grdma_pool_contructor (mca_mpool_grdma_pool_t *pool) memset ((void *)((uintptr_t)pool + sizeof (pool->super)), 0, sizeof (*pool) - sizeof (pool->super)); OBJ_CONSTRUCT(&pool->lru_list, opal_list_t); - OBJ_CONSTRUCT(&pool->gc_list, opal_list_t); + OBJ_CONSTRUCT(&pool->gc_lifo, opal_lifo_t); pool->rcache = mca_rcache_base_module_create(mca_mpool_grdma_component.rcache_name); } @@ -69,7 +69,7 @@ static void mca_mpool_grdma_pool_contructor (mca_mpool_grdma_pool_t *pool) static void mca_mpool_grdma_pool_destructor (mca_mpool_grdma_pool_t *pool) { OBJ_DESTRUCT(&pool->lru_list); - OBJ_DESTRUCT(&pool->gc_list); + OBJ_DESTRUCT(&pool->gc_lifo); free (pool->pool_name); } @@ -119,15 +119,10 @@ static inline int dereg_mem(mca_mpool_base_registration_t *reg) if(!(reg->flags & MCA_MPOOL_FLAGS_CACHE_BYPASS)) reg->mpool->rcache->rcache_delete(reg->mpool->rcache, reg); - /* Drop the rcache lock before deregistring the memory */ - OPAL_THREAD_UNLOCK(&reg->mpool->rcache->lock); - rc = mpool_grdma->resources.deregister_mem(mpool_grdma->resources.reg_data, - reg); - OPAL_THREAD_LOCK(&reg->mpool->rcache->lock); - + rc = mpool_grdma->resources.deregister_mem(mpool_grdma->resources.reg_data, reg); if (OPAL_LIKELY(OPAL_SUCCESS == rc)) { - opal_free_list_return (&mpool_grdma->reg_list, - (opal_free_list_item_t *) reg); + opal_free_list_return_mt (&mpool_grdma->reg_list, + (opal_free_list_item_t *) reg); } return rc; @@ -180,11 +175,9 @@ static inline void do_unregistration_gc(struct mca_mpool_base_module_t *mpool) mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t*)mpool; opal_list_item_t *item; - /* Remove registration from garbage collection list - before deregistering it */ - while (NULL != - (item = opal_list_remove_first(&mpool_grdma->pool->gc_list))) { - dereg_mem((mca_mpool_base_registration_t *) item); + /* Remove registration from garbage collection list before deregistering it */ + while (NULL != (item = opal_lifo_pop_atomic
(&mpool_grdma->pool->gc_lifo))) { + dereg_mem ((mca_mpool_base_registration_t *) item); } } @@ -193,8 +186,10 @@ static inline bool mca_mpool_grdma_evict_lru_local (mca_mpool_grdma_pool_t *pool mca_mpool_grdma_module_t *mpool_grdma; mca_mpool_base_registration_t *old_reg; + opal_mutex_lock (&pool->rcache->lock); old_reg = (mca_mpool_base_registration_t *) opal_list_remove_first (&pool->lru_list); + opal_mutex_unlock (&pool->rcache->lock); if (NULL == old_reg) { return false; } @@ -220,6 +215,61 @@ bool mca_mpool_grdma_evict (struct mca_mpool_base_module_t *mpool) return mca_mpool_grdma_evict_lru_local (((mca_mpool_grdma_module_t *) mpool)->pool); } +struct mca_rcache_base_find_args_t { + mca_mpool_base_registration_t *reg; + mca_mpool_grdma_module_t *mpool_grdma; + unsigned char *base; + unsigned char *bound; + int access_flags; +}; + +typedef struct mca_rcache_base_find_args_t mca_rcache_base_find_args_t; + +static int mca_mpool_grdma_check_cached (mca_mpool_base_registration_t *grdma_reg, void *ctx) +{ + mca_rcache_base_find_args_t *args = (mca_rcache_base_find_args_t *) ctx; + mca_mpool_grdma_module_t *mpool_grdma = args->mpool_grdma; + + if ((grdma_reg->flags & MCA_MPOOL_FLAGS_INVALID) || &mpool_grdma->super != grdma_reg->mpool || + grdma_reg->base > args->base || grdma_reg->bound < args->bound) { + return 0; + } + + if (OPAL_UNLIKELY((args->access_flags & grdma_reg->access_flags) != args->access_flags)) { + args->access_flags |= grdma_reg->access_flags; + + if (0 != grdma_reg->ref_count) { + if (!(grdma_reg->flags & MCA_MPOOL_FLAGS_CACHE_BYPASS)) { + grdma_reg->mpool->rcache->rcache_delete (grdma_reg->mpool->rcache, grdma_reg); + } + + /* mark the registration to go away when it is deregistered */ + grdma_reg->flags |= MCA_MPOOL_FLAGS_INVALID | MCA_MPOOL_FLAGS_CACHE_BYPASS; + } else { + if (registration_is_cacheable(grdma_reg)) { + opal_list_remove_item (&mpool_grdma->pool->lru_list, (opal_list_item_t *) grdma_reg); + } + + dereg_mem (grdma_reg); + } + } else 
{ + if (0 == grdma_reg->ref_count) { + /* Leave pinned must be set for this to still be in the rcache. */ + opal_list_remove_item(&mpool_grdma->pool->lru_list, + (opal_list_item_t *) grdma_reg); + } + + /* This segment fits fully within an existing segment. */ + mpool_grdma->stat_cache_hit++; + (void) opal_atomic_add_32 (&grdma_reg->ref_count, 1); + args->reg = grdma_reg; + return 1; + } + + /* can't use this registration */ + return 0; +} + /* * register memory */ @@ -235,16 +285,12 @@ int mca_mpool_grdma_register (mca_mpool_base_module_t *mpool, void *addr, unsigned char *base, *bound; int rc; - OPAL_THREAD_LOCK(&mpool->rcache->lock); - *reg = NULL; /* if cache bypass is requested don't use the cache */ base = (unsigned char *) down_align_addr(addr, mca_mpool_base_page_size_log); bound = (unsigned char *) up_align_addr((void*)((char*) addr + size - 1), mca_mpool_base_page_size_log); - if (!opal_list_is_empty (&mpool_grdma->pool->gc_list)) - do_unregistration_gc(mpool); #if OPAL_CUDA_GDR_SUPPORT if (flags & MCA_MPOOL_FLAGS_CUDA_GPU_MEM) { @@ -257,58 +303,31 @@ int mca_mpool_grdma_register (mca_mpool_base_module_t *mpool, void *addr, } #endif /* OPAL_CUDA_GDR_SUPPORT */ + do_unregistration_gc(mpool); + /* look through existing regs if not persistent registration requested. 
* Persistent registration are always registered and placed in the cache */ if(!(bypass_cache || persist)) { + mca_rcache_base_find_args_t find_args = {.reg = NULL, .mpool_grdma = mpool_grdma, + .base = base, .bound = bound, + .access_flags = access_flags}; + /* check to see if memory is registered */ - mpool->rcache->rcache_find(mpool->rcache, base, bound - base + 1, &grdma_reg); - if (grdma_reg && !(flags & MCA_MPOOL_FLAGS_INVALID)) { - if (OPAL_UNLIKELY((access_flags & grdma_reg->access_flags) != access_flags)) { - access_flags |= grdma_reg->access_flags; - - if (0 != grdma_reg->ref_count) { - if (!(grdma_reg->flags & MCA_MPOOL_FLAGS_CACHE_BYPASS)) { - grdma_reg->mpool->rcache->rcache_delete(grdma_reg->mpool->rcache, grdma_reg); - } - - /* mark the registration to go away when it is deregistered */ - grdma_reg->flags |= MCA_MPOOL_FLAGS_INVALID | MCA_MPOOL_FLAGS_CACHE_BYPASS; - } else { - if (registration_is_cacheable (grdma_reg)) { - /* pull the item out of the lru */ - opal_list_remove_item (&mpool_grdma->pool->lru_list, (opal_list_item_t *) grdma_reg); - } - - (void) dereg_mem (grdma_reg); - } - } else { - *reg = grdma_reg; - if (0 == grdma_reg->ref_count) { - /* Leave pinned must be set for this to still be in the rcache. */ - opal_list_remove_item(&mpool_grdma->pool->lru_list, - (opal_list_item_t *) grdma_reg); - } - - /* This segment fits fully within an existing segment. */ - mpool_grdma->stat_cache_hit++; - grdma_reg->ref_count++; - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); - return OPAL_SUCCESS; - } + rc = mpool->rcache->rcache_iterate (mpool->rcache, base, size, + mca_mpool_grdma_check_cached, (void *) &find_args); + if (1 == rc) { + *reg = find_args.reg; + return OPAL_SUCCESS; } - mpool_grdma->stat_cache_miss++; + /* get updated access flags */ + access_flags = find_args.access_flags; - /* Unless explicitly requested by the caller always store the - * registration in the rcache. 
This will speed up the case where - * no leave pinned protocol is in use but the same segment is in - * use in multiple simultaneous transactions. We used to set bypass_cache - * here is !mca_mpool_grdma_component.leave_pinned. */ + OPAL_THREAD_ADD32((volatile int32_t *) &mpool_grdma->stat_cache_miss, 1); } - item = opal_free_list_get (&mpool_grdma->reg_list); + item = opal_free_list_get_mt (&mpool_grdma->reg_list); if(NULL == item) { - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); return OPAL_ERR_OUT_OF_RESOURCE; } grdma_reg = (mca_mpool_base_registration_t*)item; @@ -318,22 +337,13 @@ int mca_mpool_grdma_register (mca_mpool_base_module_t *mpool, void *addr, grdma_reg->bound = bound; grdma_reg->flags = flags; grdma_reg->access_flags = access_flags; + grdma_reg->ref_count = 1; #if OPAL_CUDA_GDR_SUPPORT if (flags & MCA_MPOOL_FLAGS_CUDA_GPU_MEM) { mca_common_cuda_get_buffer_id(grdma_reg); } #endif /* OPAL_CUDA_GDR_SUPPORT */ - if (false == bypass_cache) { - rc = mpool->rcache->rcache_insert(mpool->rcache, grdma_reg, 0); - - if (OPAL_UNLIKELY(rc != OPAL_SUCCESS)) { - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); - opal_free_list_return (&mpool_grdma->reg_list, item); - return rc; - } - } - while (OPAL_ERR_OUT_OF_RESOURCE == (rc = mpool_grdma->resources.register_mem(mpool_grdma->resources.reg_data, base, bound - base + 1, grdma_reg))) { @@ -344,20 +354,21 @@ int mca_mpool_grdma_register (mca_mpool_base_module_t *mpool, void *addr, } if (OPAL_UNLIKELY(rc != OPAL_SUCCESS)) { - if (false == bypass_cache) { - mpool->rcache->rcache_delete(mpool->rcache, grdma_reg); - } - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); - opal_free_list_return (&mpool_grdma->reg_list, item); + opal_free_list_return_mt (&mpool_grdma->reg_list, item); return rc; } + if (false == bypass_cache) { + rc = mpool->rcache->rcache_insert(mpool->rcache, grdma_reg, 0); + + if (OPAL_UNLIKELY(rc != OPAL_SUCCESS)) { + opal_free_list_return_mt (&mpool_grdma->reg_list, item); + return rc; + } + } + *reg = grdma_reg; - 
(*reg)->ref_count++; - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); - /* Cleanup any vmas that we have deferred deletion on */ - mpool->rcache->rcache_clean(mpool->rcache); return OPAL_SUCCESS; } @@ -398,7 +409,7 @@ int mca_mpool_grdma_find(struct mca_mpool_base_module_t *mpool, void *addr, bound = (unsigned char *) up_align_addr((void*)((char*) addr + size - 1), mca_mpool_base_page_size_log); - OPAL_THREAD_LOCK(&mpool->rcache->lock); + opal_mutex_lock (&mpool->rcache->lock); rc = mpool->rcache->rcache_find(mpool->rcache, base, bound - base + 1, reg); if(NULL != *reg && @@ -412,12 +423,12 @@ int mca_mpool_grdma_find(struct mca_mpool_base_module_t *mpool, void *addr, (opal_list_item_t*)(*reg)); } mpool_grdma->stat_cache_found++; - (*reg)->ref_count++; + (void) opal_atomic_add_32 (&(*reg)->ref_count, 1); } else { mpool_grdma->stat_cache_notfound++; } - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); + opal_mutex_unlock (&mpool->rcache->lock); return rc; } @@ -426,62 +437,67 @@ int mca_mpool_grdma_deregister(struct mca_mpool_base_module_t *mpool, mca_mpool_base_registration_t *reg) { mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t *) mpool; - int rc = OPAL_SUCCESS; - assert(reg->ref_count > 0); + int32_t ref_count; + int rc; + + opal_mutex_lock (&mpool_grdma->pool->rcache->lock); + ref_count = opal_atomic_add_32 (&reg->ref_count, -1); + + assert (ref_count >= 0); + if (ref_count > 0) { + opal_mutex_unlock (&mpool_grdma->pool->rcache->lock); - OPAL_THREAD_LOCK(&mpool->rcache->lock); - reg->ref_count--; - if(reg->ref_count > 0) { - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); return OPAL_SUCCESS; } if (registration_is_cacheable(reg)) { opal_list_append(&mpool_grdma->pool->lru_list, (opal_list_item_t *) reg); - } else { - rc = dereg_mem (reg); + opal_mutex_unlock (&mpool_grdma->pool->rcache->lock); + + return OPAL_SUCCESS; } - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); - /* Cleanup any vmas that we have deferred deletion on */ -
mpool->rcache->rcache_clean(mpool->rcache); + rc = dereg_mem (reg); + opal_mutex_unlock (&mpool_grdma->pool->rcache->lock); return rc; } -#define GRDMA_MPOOL_NREGS 100 +static int gc_add (mca_mpool_base_registration_t *grdma_reg, void *ctx) +{ + mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t *) grdma_reg->mpool; + + /* unused */ + (void) ctx; + + if (grdma_reg->flags & MCA_MPOOL_FLAGS_INVALID) { + /* nothing more to do */ + return OPAL_SUCCESS; + } + + if (grdma_reg->ref_count) { + /* attempted to remove an active registration */ + return OPAL_ERROR; + } + + /* This may be called from free() so avoid recursively calling into free by just + * shifting this registration into the garbage collection list. The cleanup will + * be done on the next registration attempt. */ + if (registration_is_cacheable (grdma_reg)) { + opal_list_remove_item (&mpool_grdma->pool->lru_list, (opal_list_item_t *) grdma_reg); + } + + grdma_reg->flags |= MCA_MPOOL_FLAGS_INVALID; + + opal_lifo_push_atomic (&mpool_grdma->pool->gc_lifo, (opal_list_item_t *) grdma_reg); + + return OPAL_SUCCESS; +} int mca_mpool_grdma_release_memory(struct mca_mpool_base_module_t *mpool, void *base, size_t size) { - mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t *) mpool; - mca_mpool_base_registration_t *regs[GRDMA_MPOOL_NREGS]; - int reg_cnt, i, rc = OPAL_SUCCESS; - - OPAL_THREAD_LOCK(&mpool->rcache->lock); - do { - reg_cnt = mpool->rcache->rcache_find_all(mpool->rcache, base, size, - regs, GRDMA_MPOOL_NREGS); - - for(i = 0 ; i < reg_cnt ; ++i) { - regs[i]->flags |= MCA_MPOOL_FLAGS_INVALID; - if (regs[i]->ref_count) { - /* memory is being freed, but there are registration in use that - * covers the memory. This can happen even in a correct program, - * but may also be an user error. We can't tell. Mark the - * registration as invalid. 
It will not be used any more and - * will be unregistered when ref_count will become zero */ - rc = OPAL_ERROR; /* tell caller that something was wrong */ - } else { - opal_list_remove_item(&mpool_grdma->pool->lru_list,(opal_list_item_t *) regs[i]); - opal_list_append(&mpool_grdma->pool->gc_list, (opal_list_item_t *) regs[i]); - } - } - } while(reg_cnt == GRDMA_MPOOL_NREGS); - - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); - - return rc; + return mpool->rcache->rcache_iterate (mpool->rcache, base, size, gc_add, NULL); } /* Make sure this registration request is not stale. In other words, ensure @@ -492,8 +508,6 @@ int mca_mpool_grdma_release_memory(struct mca_mpool_base_module_t *mpool, static int check_for_cuda_freed_memory(mca_mpool_base_module_t *mpool, void *addr, size_t size) { mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t *) mpool; - mca_mpool_base_registration_t *regs[GRDMA_MPOOL_NREGS]; - int reg_cnt, i, rc = OPAL_SUCCESS; mca_mpool_base_registration_t *reg; mpool->rcache->rcache_find(mpool->rcache, addr, size, &reg); @@ -506,44 +520,34 @@ static int check_for_cuda_freed_memory(mca_mpool_base_module_t *mpool, void *add return OPAL_SUCCESS; } - /* mpool->rcache->rcache_dump_range(mpool->rcache, 0, (size_t)-1, "Before free"); */ - - /* This memory has been freed. Find all registrations and delete */ - do { - reg_cnt = mpool->rcache->rcache_find_all(mpool->rcache, reg->base, reg->bound - reg->base + 1, - regs, GRDMA_MPOOL_NREGS); - for(i = 0 ; i < reg_cnt ; ++i) { - regs[i]->flags |= MCA_MPOOL_FLAGS_INVALID; - if (regs[i]->ref_count) { - opal_output(0, "Release FAILED: ref_count=%d, base=%p, bound=%p, size=%d", - regs[i]->ref_count, regs[i]->base, regs[i]->bound, - (int) (regs[i]->bound - regs[i]->base + 1)); - /* memory is being freed, but there are registration in use that - * covers the memory. This can happen even in a correct program, - * but may also be an user error. We can't tell. Mark the - * registration as invalid.
It will not be used any more and - * will be unregistered when ref_count will become zero */ - rc = OPAL_ERROR; /* tell caller that something was wrong */ - } else { - opal_list_remove_item(&mpool_grdma->pool->lru_list,(opal_list_item_t *) regs[i]); - /* Now deregister. Do not use gc_list as we need to kick this out now. */ - dereg_mem(regs[i]); - } - } - } while(reg_cnt == GRDMA_MPOOL_NREGS); + /* This memory has been freed. Find all registrations and delete. Ensure they are deregistered + * now by passing dereg_mem as the delete function. This is safe because the rcache lock is + * recursive and this is only called from register. */ + return mpool->rcache->rcache_iterate (mpool->rcache, base, size, gc_add, NULL); +} +#endif /* OPAL_CUDA_GDR_SUPPORT */ - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); - /* mpool->rcache->rcache_dump_range(mpool->rcache, 0, (size_t)-1, "After free");*/ +static int iterate_dereg_finalize (mca_mpool_base_registration_t *grdma_reg, void *ctx) +{ + mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t *) ctx; - return rc; + if ((mca_mpool_base_module_t *) mpool_grdma != grdma_reg->mpool) { + return 0; + } + + if (registration_is_cacheable (grdma_reg)) { + opal_list_remove_item (&mpool_grdma->pool->lru_list, (opal_list_item_t *) grdma_reg); + } + + /* set the reference count to 0 otherwise dereg will fail on assert */ + grdma_reg->ref_count = 0; + + return dereg_mem (grdma_reg); } -#endif /* OPAL_CUDA_GDR_SUPPORT */ void mca_mpool_grdma_finalize(struct mca_mpool_base_module_t *mpool) { mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t*)mpool; - mca_mpool_base_registration_t *regs[GRDMA_MPOOL_NREGS]; - int reg_cnt, i; /* Statistic */ if (true == mca_mpool_grdma_component.print_stats) { @@ -555,33 +559,14 @@ void mca_mpool_grdma_finalize(struct mca_mpool_base_module_t *mpool) mpool_grdma->stat_evicted); } - OPAL_THREAD_LOCK(&mpool->rcache->lock); do_unregistration_gc(mpool); - do { - reg_cnt = 
mpool->rcache->rcache_find_all(mpool->rcache, 0, (size_t)-1, - regs, GRDMA_MPOOL_NREGS); - - for (i = 0 ; i < reg_cnt ; ++i) { - if (regs[i]->ref_count) { - regs[i]->ref_count = 0; /* otherwise dereg will fail on assert */ - } else if (mca_mpool_grdma_component.leave_pinned) { - opal_list_remove_item(&mpool_grdma->pool->lru_list, - (opal_list_item_t *) regs[i]); - } - - (void) dereg_mem(regs[i]); - } - } while (reg_cnt == GRDMA_MPOOL_NREGS); + (void) mpool->rcache->rcache_iterate (mpool->rcache, NULL, (size_t) -1, + iterate_dereg_finalize, (void *) mpool); OBJ_RELEASE(mpool_grdma->pool); - OBJ_DESTRUCT(&mpool_grdma->reg_list); - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); - - /* Cleanup any vmas that we have deferred deletion on */ - mpool->rcache->rcache_clean(mpool->rcache); /* this mpool was allocated by grdma_init in mpool_grdma_component.c */ free(mpool); diff --git a/opal/mca/rcache/rcache.h b/opal/mca/rcache/rcache.h index 729b900536..97ac0ca79a 100644 --- a/opal/mca/rcache/rcache.h +++ b/opal/mca/rcache/rcache.h @@ -59,6 +59,10 @@ typedef int (*mca_rcache_base_module_clean_fn_t)( typedef void (*mca_rcache_base_module_dump_range_fn_t)( struct mca_rcache_base_module_t* rcache, unsigned char* addr, size_t size, char *msg); +typedef int (*mca_rcache_base_module_iterate_fn_t)( + struct mca_rcache_base_module_t* rcache, unsigned char *base, size_t size, + int (*callback_fn) (mca_mpool_base_registration_t *, void *), void *ctx); + /** * finalize */ @@ -93,6 +97,7 @@ struct mca_rcache_base_module_t { mca_rcache_base_module_clean_fn_t rcache_clean; mca_rcache_base_module_finalize_fn_t rcache_finalize; mca_rcache_base_module_dump_range_fn_t rcache_dump_range; + mca_rcache_base_module_iterate_fn_t rcache_iterate; opal_mutex_t lock; }; typedef struct mca_rcache_base_module_t mca_rcache_base_module_t; diff --git a/opal/mca/rcache/vma/rcache_vma.c b/opal/mca/rcache/vma/rcache_vma.c index 8c9bd5e6f0..55a332871b 100644 --- a/opal/mca/rcache/vma/rcache_vma.c +++ 
b/opal/mca/rcache/vma/rcache_vma.c @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -13,6 +14,8 @@ * Copyright (c) 2009-2013 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2009 IBM Corporation. All rights reserved. * Copyright (c) 2013 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights + * reserved. * * $COPYRIGHT$ * @@ -39,9 +42,9 @@ void mca_rcache_vma_module_init( mca_rcache_vma_module_t* rcache ) { rcache->base.rcache_find_all = mca_rcache_vma_find_all; rcache->base.rcache_insert = mca_rcache_vma_insert; rcache->base.rcache_delete = mca_rcache_vma_delete; - rcache->base.rcache_clean = mca_rcache_vma_clean; rcache->base.rcache_finalize = mca_rcache_vma_finalize; rcache->base.rcache_dump_range = mca_rcache_vma_dump_range; + rcache->base.rcache_iterate = mca_rcache_vma_iterate; OBJ_CONSTRUCT(&rcache->base.lock, opal_recursive_mutex_t); mca_rcache_vma_tree_init(rcache); } @@ -139,29 +142,13 @@ int mca_rcache_vma_delete(struct mca_rcache_base_module_t* rcache, return mca_rcache_vma_tree_delete(vma_rcache, reg); } -int mca_rcache_vma_clean(struct mca_rcache_base_module_t* rcache) +int mca_rcache_vma_iterate (struct mca_rcache_base_module_t* rcache, + unsigned char *base, size_t size, + int (*callback_fn) (mca_mpool_base_registration_t *, void *), + void *ctx) { mca_rcache_vma_module_t *vma_rcache = (mca_rcache_vma_module_t*)rcache; - mca_rcache_vma_t *vma; - opal_list_item_t *i; - - do { - OPAL_THREAD_LOCK(&rcache->lock); - i = opal_list_get_first(&vma_rcache->vma_delete_list); - if(opal_list_get_end(&vma_rcache->vma_delete_list) == i) { - vma = NULL; - OPAL_THREAD_UNLOCK(&rcache->lock); - } else { - vma = (mca_rcache_vma_t *)i; - opal_list_remove_item(&vma_rcache->vma_delete_list, &vma->super); - - /* Need to drop the rcache lock before destroying 
the vma */ - OPAL_THREAD_UNLOCK(&rcache->lock); - - mca_rcache_vma_destroy(vma); - } - } while (NULL != vma); - return OPAL_SUCCESS; + return mca_rcache_vma_tree_iterate (vma_rcache, base, size, callback_fn, ctx); } /** diff --git a/opal/mca/rcache/vma/rcache_vma.h b/opal/mca/rcache/vma/rcache_vma.h index 0306fc0bba..984a05eb4e 100644 --- a/opal/mca/rcache/vma/rcache_vma.h +++ b/opal/mca/rcache/vma/rcache_vma.h @@ -1,25 +1,27 @@ -/* -*- Mode: C; c-basic-offset:4 ; -*- */ -/** - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2007 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * - * Copyright (c) 2006 Voltaire. All rights reserved. - * Copyright (c) 2009 IBM Corporation. All rights reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2007 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * + * Copyright (c) 2006 Voltaire. All rights reserved. + * Copyright (c) 2009 IBM Corporation. All rights reserved. + * Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights + * reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ /** * @file * Description of the Registration Cache framework @@ -38,7 +40,6 @@ struct mca_rcache_vma_module_t { mca_rcache_base_module_t base; opal_rb_tree_t rb_tree; opal_list_t vma_list; - opal_list_t vma_delete_list; size_t reg_cur_cache_size; }; typedef struct mca_rcache_vma_module_t mca_rcache_vma_module_t; @@ -67,13 +68,6 @@ int mca_rcache_vma_insert(struct mca_rcache_base_module_t* rcache, int mca_rcache_vma_delete(struct mca_rcache_base_module_t* rcache, mca_mpool_base_registration_t* registration); -/* It is not safe to call mca_rcache_vma_clean with the rcache lock held */ -int mca_rcache_vma_clean(struct mca_rcache_base_module_t* rcache); -/* Destroy vma objects which are on the deferred delete list. These were placed - on the list earlier when the rcache lock was held and it was not safe to - destory them. They should not be linked into any other structure anymore except - the vma_list_delete list */ - /** * init/finalize */ @@ -86,6 +80,25 @@ void mca_rcache_vma_dump_range(struct mca_rcache_base_module_t *rcache, unsigned char* addr, size_t size, char *msg); +/** + * Iterate over registrations in the specified range. + * + * @param[in] vma_module vma tree + * @param[in] base base address of region + * @param[in] size size of region + * @param[in] callback_fn function to call for each matching registration handle + * @param[in] ctx callback context + * + * The callback will be made with the vma lock held. This is a recursive lock so + * it is still safe to call any vma functions on this vma_module. Keep in mind it + * is only safe to call mca_rcache_base_vma_delete() on the supplied registration + * from the callback. The iteration will terminate if the callback returns anything + * other than OPAL_SUCCESS. 
+ */ +int mca_rcache_vma_iterate (mca_rcache_base_module_t *rcache, unsigned char *base, size_t size, + int (*callback_fn) (mca_mpool_base_registration_t *, void *), + void *ctx); + END_C_DECLS #endif /* MCA_RCACHE_VMA_H */ diff --git a/opal/mca/rcache/vma/rcache_vma_tree.c b/opal/mca/rcache/vma/rcache_vma_tree.c index 1c1d765bd3..9891f57e1a 100644 --- a/opal/mca/rcache/vma/rcache_vma_tree.c +++ b/opal/mca/rcache/vma/rcache_vma_tree.c @@ -16,7 +16,7 @@ * Copyright (c) 2009 IBM Corporation. All rights reserved. * Copyright (c) 2013 NVIDIA Corporation. All rights reserved. * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. @@ -265,7 +265,6 @@ int mca_rcache_vma_tree_init(mca_rcache_vma_module_t* rcache) { OBJ_CONSTRUCT(&rcache->rb_tree, opal_rb_tree_t); OBJ_CONSTRUCT(&rcache->vma_list, opal_list_t); - OBJ_CONSTRUCT(&rcache->vma_delete_list, opal_list_t); rcache->reg_cur_cache_size = 0; return opal_rb_tree_init(&rcache->rb_tree, mca_rcache_vma_tree_node_compare); @@ -275,7 +274,6 @@ void mca_rcache_vma_tree_finalize(mca_rcache_vma_module_t* rcache) { opal_rb_tree_init(&rcache->rb_tree, mca_rcache_vma_tree_node_compare); - OBJ_DESTRUCT(&rcache->vma_delete_list); OBJ_DESTRUCT(&rcache->vma_list); OBJ_DESTRUCT(&rcache->rb_tree); } @@ -287,26 +285,29 @@ mca_mpool_base_registration_t *mca_rcache_vma_tree_find( mca_rcache_vma_t *vma; mca_rcache_vma_reg_list_item_t *item; + opal_mutex_lock (&vma_rcache->base.lock); + vma = (mca_rcache_vma_t*)opal_rb_tree_find_with(&vma_rcache->rb_tree, base, mca_rcache_vma_tree_node_compare_search); - if(!vma) + if(!vma) { + opal_mutex_unlock (&vma_rcache->base.lock); return NULL; + } - for(item = (mca_rcache_vma_reg_list_item_t*) - opal_list_get_first(&vma->reg_list); 
- item != (mca_rcache_vma_reg_list_item_t*) - opal_list_get_end(&vma->reg_list); - item = (mca_rcache_vma_reg_list_item_t*) - opal_list_get_next(item)) { + OPAL_LIST_FOREACH(item, &vma->reg_list, mca_rcache_vma_reg_list_item_t) { if(item->reg->flags & MCA_MPOOL_FLAGS_INVALID) continue; - if(item->reg->bound >= bound) + if(item->reg->bound >= bound) { + opal_mutex_unlock (&vma_rcache->base.lock); return item->reg; + } if(!(item->reg->flags & MCA_MPOOL_FLAGS_PERSIST)) break; } + opal_mutex_unlock (&vma_rcache->base.lock); + return NULL; } @@ -333,9 +334,11 @@ int mca_rcache_vma_tree_find_all( if(opal_list_get_size(&vma_rcache->vma_list) == 0) return cnt; + opal_mutex_lock (&vma_rcache->base.lock); + do { + mca_rcache_vma_reg_list_item_t *vma_item; mca_rcache_vma_t *vma; - opal_list_item_t *item; vma = (mca_rcache_vma_t*) opal_rb_tree_find_with(&vma_rcache->rb_tree, base, mca_rcache_vma_tree_node_compare_closest); @@ -350,26 +353,81 @@ int mca_rcache_vma_tree_find_all( continue; } - for(item = opal_list_get_first(&vma->reg_list); - item != opal_list_get_end(&vma->reg_list); - item = opal_list_get_next(item)) { - mca_rcache_vma_reg_list_item_t *vma_item; - vma_item = (mca_rcache_vma_reg_list_item_t*)item; + OPAL_LIST_FOREACH(vma_item, &vma->reg_list, mca_rcache_vma_reg_list_item_t) { if((vma_item->reg->flags & MCA_MPOOL_FLAGS_INVALID) || is_reg_in_array(regs, cnt, vma_item->reg)) { continue; } regs[cnt++] = vma_item->reg; - if(cnt == reg_cnt) + if(cnt == reg_cnt) { + opal_mutex_unlock (&vma_rcache->base.lock); return cnt; /* no space left in the provided array */ + } } base = (unsigned char *)vma->end + 1; - } while(bound >= base); + } while (bound >= base); + + opal_mutex_unlock (&vma_rcache->base.lock); return cnt; } +int mca_rcache_vma_tree_iterate (mca_rcache_vma_module_t *vma_module, unsigned char *base, + size_t size, int (*callback_fn) (mca_mpool_base_registration_t *, void *), + void *ctx) +{ + unsigned char *bound = base + size - 1; + mca_rcache_vma_t *vma; + 
int rc = OPAL_SUCCESS; + + if (opal_list_get_size(&vma_module->vma_list) == 0) { + /* nothing to do */ + return OPAL_SUCCESS; + } + + opal_mutex_lock (&vma_module->base.lock); + + do { + mca_rcache_vma_reg_list_item_t *vma_item, *next; + vma = (mca_rcache_vma_t *) opal_rb_tree_find_with (&vma_module->rb_tree, base, + mca_rcache_vma_tree_node_compare_closest); + + if (NULL == vma) { + /* base is bigger than any registered memory */ + break; + } + + if (base < (unsigned char *) vma->start) { + base = (unsigned char *) vma->start; + continue; + } + + base = (unsigned char *)vma->end + 1; + + /* all the registrations in the vma may be deleted by the callback so keep a + * reference until we are done with it. */ + OBJ_RETAIN(vma); + + OPAL_LIST_FOREACH_SAFE(vma_item, next, &vma->reg_list, mca_rcache_vma_reg_list_item_t) { + rc = callback_fn (vma_item->reg, ctx); + if (OPAL_SUCCESS != rc) { + break; + } + } + + OBJ_RELEASE(vma); + + if (OPAL_SUCCESS != rc) { + break; + } + } while (bound >= base); + + opal_mutex_unlock (&vma_module->base.lock); + + return rc; +} + static inline int mca_rcache_vma_can_insert( mca_rcache_vma_module_t *vma_rcache, size_t nbytes, size_t limit) { @@ -395,6 +453,8 @@ int mca_rcache_vma_tree_insert(mca_rcache_vma_module_t* vma_rcache, mca_rcache_vma_t *i; uintptr_t begin = (uintptr_t)reg->base, end = (uintptr_t)reg->bound; + opal_mutex_lock (&vma_rcache->base.lock); + i = (mca_rcache_vma_t*)opal_rb_tree_find_with(&vma_rcache->rb_tree, (void*)begin, mca_rcache_vma_tree_node_compare_closest); @@ -472,9 +532,12 @@ int mca_rcache_vma_tree_insert(mca_rcache_vma_module_t* vma_rcache, i = (mca_rcache_vma_t*)opal_list_get_next(&i->super); } + opal_mutex_unlock (&vma_rcache->base.lock); + return OPAL_SUCCESS; remove: + opal_mutex_unlock (&vma_rcache->base.lock); mca_rcache_vma_tree_delete(vma_rcache, reg); return OPAL_ERR_TEMP_OUT_OF_RESOURCE; } @@ -488,8 +551,9 @@ int mca_rcache_vma_tree_insert(mca_rcache_vma_module_t* vma_rcache, * @retval
OPAL_ERR_BAD_PARAM if the passed base pointer was invalid */ int mca_rcache_vma_tree_delete(mca_rcache_vma_module_t* vma_rcache, - mca_mpool_base_registration_t* reg) + mca_mpool_base_registration_t* reg) { + opal_list_t deleted_vmas; mca_rcache_vma_t *vma; vma = (mca_rcache_vma_t*)opal_rb_tree_find_with(&vma_rcache->rb_tree, reg->base, @@ -498,6 +562,10 @@ int mca_rcache_vma_tree_delete(mca_rcache_vma_module_t* vma_rcache, if(!vma) return OPAL_ERROR; + opal_mutex_lock (&vma_rcache->base.lock); + + OBJ_CONSTRUCT(&deleted_vmas, opal_list_t); + while(vma != (mca_rcache_vma_t*)opal_list_get_end(&vma_rcache->vma_list) && vma->start <= (uintptr_t)reg->bound) { mca_rcache_vma_remove_reg(vma, reg); @@ -508,7 +576,7 @@ int mca_rcache_vma_tree_delete(mca_rcache_vma_module_t* vma_rcache, mca_rcache_vma_update_byte_count(vma_rcache, vma->start - vma->end - 1); opal_list_remove_item(&vma_rcache->vma_list, &vma->super); - opal_list_append(&vma_rcache->vma_delete_list, &vma->super); + opal_list_append(&deleted_vmas, &vma->super); vma = next; } else { int merged; @@ -525,7 +593,7 @@ int mca_rcache_vma_tree_delete(mca_rcache_vma_module_t* vma_rcache, prev->end = vma->end; opal_list_remove_item(&vma_rcache->vma_list, &vma->super); opal_rb_tree_delete(&vma_rcache->rb_tree, vma); - opal_list_append(&vma_rcache->vma_delete_list, &vma->super); + opal_list_append(&deleted_vmas, &vma->super); vma = prev; merged = 1; } @@ -538,13 +606,19 @@ int mca_rcache_vma_tree_delete(mca_rcache_vma_module_t* vma_rcache, vma->end = next->end; opal_list_remove_item(&vma_rcache->vma_list, &next->super); opal_rb_tree_delete(&vma_rcache->rb_tree, next); - opal_list_append(&vma_rcache->vma_delete_list, &next->super); + opal_list_append(&deleted_vmas, &next->super); merged = 1; } } while(merged); vma = (mca_rcache_vma_t*)opal_list_get_next(vma); } } + + opal_mutex_unlock (&vma_rcache->base.lock); + + /* actually free vmas now that the lock has been dropped */ + OPAL_LIST_DESTRUCT(&deleted_vmas); + return 0; 
} @@ -566,6 +640,8 @@ void mca_rcache_vma_tree_dump_range(mca_rcache_vma_module_t *vma_rcache, return; } + opal_mutex_lock (&vma_rcache->base.lock); + do { mca_rcache_vma_t *vma; opal_list_item_t *item; @@ -593,8 +669,10 @@ void mca_rcache_vma_tree_dump_range(mca_rcache_vma_module_t *vma_rcache, vma_item = (mca_rcache_vma_reg_list_item_t*)item; reg = vma_item->reg; opal_output(0, " reg: base=%p, bound=%p, alloc_base=%p, ref_count=%d, flags=0x%x", - reg->base, reg->bound, reg->alloc_base, reg->ref_count, reg->flags); + (void *) reg->base, (void *) reg->bound, (void *) reg->alloc_base, + reg->ref_count, reg->flags); } base = (unsigned char *)vma->end + 1; } while(bound >= base); + opal_mutex_unlock (&vma_rcache->base.lock); } diff --git a/opal/mca/rcache/vma/rcache_vma_tree.h b/opal/mca/rcache/vma/rcache_vma_tree.h index 77884eba2f..9338a10a42 100644 --- a/opal/mca/rcache/vma/rcache_vma_tree.h +++ b/opal/mca/rcache/vma/rcache_vma_tree.h @@ -1,26 +1,28 @@ -/* -*- Mode: C; c-basic-offset:4 ; -*- */ -/** - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2007 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * - * Copyright (c) 2006 Voltaire. All rights reserved. - * Copyright (c) 2009 IBM Corporation. All rights reserved. - * - * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. 
All rights reserved. + * Copyright (c) 2004-2007 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * + * Copyright (c) 2006 Voltaire. All rights reserved. + * Copyright (c) 2009 IBM Corporation. All rights reserved. + * Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights + * reserved. + * + * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ /** * @file * Description of the Registration Cache framework @@ -112,5 +114,14 @@ void mca_rcache_vma_tree_dump_range(mca_rcache_vma_module_t *vma_rcache, unsigned char *base, size_t size, char *msg); +/* + * Iterate over matching registration handles in the tree. + */ +int mca_rcache_vma_tree_iterate (mca_rcache_vma_module_t *vma_module, + unsigned char *base, size_t size, + int (*callback_fn) (mca_mpool_base_registration_t *, void *), + void *ctx); + + #endif /* MCA_RCACHE_VMA_TREE_H */