#
# Makefile in directory src
#
# resource usage:
#
# To compile a dynamic module
# (1) nvcc cannot accept -fPIC, so compile .cu to .cu.cpp first
#     nvcc -arch=sm_50 -cuda ../src/PFAC_kernel.cu
#
# (2) then use g++ to compile the generated PFAC_kernel.cu.cpp
#     g++ -fPIC -c PFAC_kernel.cu.cpp
#
# (3) finally combine two object files to a .so library
#     g++ -shared -o libpfac.so $(LIBS) PFAC_kernel.cu.o ...
#
# $(LIBS) is necessary when compiling PFAC library on 32-bit machine
#

include ../common.mk

INC_DIR = ../include
LIB_DIR = ../lib
OBJ_DIR = ../obj

INCPATH += -I../include/

# CUDA kernels, translated to C++ by nvcc and then compiled by $(CXX).
CU_SRC  = PFAC_kernel.cu
CU_SRC += PFAC_reduce_kernel.cu
CU_SRC += PFAC_reduce_inplace_kernel.cu
CU_SRC += PFAC_kernel_spaceDriven.cu

# Host-side sources of the library.
CPP_SRC  = PFAC_reorder_Table.cpp
CPP_SRC += PFAC_CPU.cpp
CPP_SRC += PFAC_CPU_OMP.cpp
CPP_SRC += PFAC.cpp

# Headers every host object depends on.
inc_files = $(INC_DIR)/PFAC_P.h $(INC_DIR)/PFAC.h

CU_OBJ  = $(patsubst %.cu,%.o,$(CU_SRC))
CU_CPP  = $(patsubst %.cu,%.cu.cpp,$(CU_SRC))
CPP_OBJ = $(patsubst %.cpp,%.o,$(CPP_SRC))

cppobj_loc      = $(patsubst %.o,$(OBJ_DIR)/%.o,$(CPP_OBJ))
cppobj_fpic_loc = $(patsubst %.o,$(OBJ_DIR)/%_fpic.o,$(CPP_OBJ))

# e.g. PFAC_kernel.cu -> $(OBJ_DIR)/sm50_PFAC_kernel.cu.cpp (+ .o)
cu_cpp_sm50_loc     = $(patsubst %.cpp,$(OBJ_DIR)/sm50_%.cpp,$(CU_CPP))
cu_cpp_obj_sm50_loc = $(patsubst %.cpp,$(OBJ_DIR)/sm50_%.cpp.o,$(CU_CPP))

# These goals are commands, not files; without .PHONY a file with the same
# name in this directory would silently disable them.
.PHONY: all mk_libso_no50 mk_liba mk_lib_fpic

# Keep the nvcc-generated .cu.cpp sources: they are only reached through a
# chain of pattern rules, so make would otherwise delete them as intermediates.
.SECONDARY: $(cu_cpp_sm50_loc)

all: mk_libso_no50 mk_lib_fpic

# Shared library holding the sm_50 kernels. Depends on the objects that are
# actually linked, so a missing or stale object is rebuilt.
mk_libso_no50: $(cu_cpp_obj_sm50_loc)
	$(CXX) -shared -o $(LIB_DIR)/libpfac_sm50.so $(LIBS) $(cu_cpp_obj_sm50_loc)

# Static library of the host code (not part of "all").
mk_liba: $(cppobj_loc)
	ar cru $(LIB_DIR)/libpfac.a $(cppobj_loc)
	ranlib $(LIB_DIR)/libpfac.a

# Shared library of the host code, built from position-independent objects.
mk_lib_fpic: $(cppobj_fpic_loc)
	$(CXX) -shared -o $(LIB_DIR)/libpfac.so $(cppobj_fpic_loc) $(LIBS)

$(OBJ_DIR)/%_fpic.o: %.cpp $(inc_files)
	$(CXX) -fPIC -c $(CXXFLAGS) $(INCPATH) -o $@ $<

$(OBJ_DIR)/PFAC_CPU_OMP_reorder_fpic.o: PFAC_CPU_OMP_reorder.cpp $(inc_files)
	$(CXX) -fPIC -c $(CXXFLAGS) $(INCPATH) -o $@ $<

$(OBJ_DIR)/PFAC_CPU_OMP_reorder.o: PFAC_CPU_OMP_reorder.cpp $(inc_files)
	$(CXX) -c $(CXXFLAGS) $(INCPATH) -o $@ $<

$(OBJ_DIR)/%.o: %.cpp $(inc_files)
	$(CXX) -c $(CXXFLAGS) $(INCPATH) -o $@ $<

# Step (1): nvcc translates the kernel to host C++ (it cannot take -fPIC).
$(OBJ_DIR)/sm50_%.cu.cpp: %.cu
	$(NVCC) -arch=sm_50 -cuda $(INCPATH) -o $@ $<

# Step (2): compile the generated C++ as position-independent code.
# This is a rule of its own so that every file produced is a real target.
$(OBJ_DIR)/sm50_%.cu.cpp.o: $(OBJ_DIR)/sm50_%.cu.cpp
	$(CXX) -fPIC -O2 -c -o $@ $<

#clean :
#	rm -f *.linkinfo
#	rm -f $(OBJ_DIR)/*
#	rm -f $(EXE_DIR)/*

####### Implicit rules

.SUFFIXES: .o .c .cpp .cc .cxx .C

.cpp.o:
	$(CXX) -c $(CXXFLAGS) $(INCPATH) -o "$@" "$<"

.cc.o:
	$(CXX) -c $(CXXFLAGS) $(INCPATH) -o "$@" "$<"

.cxx.o:
	$(CXX) -c $(CXXFLAGS) $(INCPATH) -o "$@" "$<"

# A suffix rule needs both suffixes (".C.o", ".c.o"); a single word such as
# ".Co" is just an ordinary target that nothing ever builds.
.C.o:
	$(CXX) -c $(CXXFLAGS) $(INCPATH) -o "$@" "$<"

.c.o:
	$(CC) -c $(CFLAGS) $(INCPATH) -o "$@" "$<"

####### Build rules
simple_example.cpp:64: int main(int, char**): Assertion `PFAC_STATUS_SUCCESS == PFAC_status' failed.
// Select the kernel module file from the device number (10*major + minor).
// Only the exact values 30, 21, 20, 13, 12 and 11 are accepted; any other
// value makes the enclosing function return PFAC_STATUS_ARCH_MISMATCH.
int device_no = 10*deviceProp.major + deviceProp.minor ; if ( 30 == device_no ){ strcpy (modulepath, "libpfac_sm30.so"); }else if ( 21 == device_no ){ strcpy (modulepath, "libpfac_sm21.so"); }else if ( 20 == device_no ){ strcpy (modulepath, "libpfac_sm20.so"); }else if ( 13 == device_no ){ strcpy (modulepath, "libpfac_sm13.so"); }else if ( 12 == device_no ){ strcpy (modulepath, "libpfac_sm12.so"); }else if ( 11 == device_no ){ strcpy (modulepath, "libpfac_sm11.so"); }else{ return PFAC_STATUS_ARCH_MISMATCH ; }
// Build the kernel module file name from the device number (10*major + minor)
// instead of matching a fixed list: values below 11 are rejected with
// PFAC_STATUS_ARCH_MISMATCH, every other value yields "libpfac_sm<device_no>.so".
// NOTE(review): sprintf is unbounded and the size of modulepath is not visible
// here - confirm the buffer can hold the formatted name.
int device_no = 10*deviceProp.major + deviceProp.minor ; if ( 11 > device_no ) return PFAC_STATUS_ARCH_MISMATCH ; sprintf(modulepath, "libpfac_sm%d.so", device_no);
Error: fails to PFAC_matchFromHost, PFAC_STATUS_CUDA_ALLOC_FAILED: allocation fails on device memory.
// Diagnostic macro: forwards all of its arguments unchanged to printf.
#define PFAC_PRINTF( ... ) printf( __VA_ARGS__ )
//#define PFAC_PRINTF(...)
Error: cannot bind texture, 11264 bytes invalid texture reference
Error: fails to PFAC_matchFromHost, PFAC_STATUS_CUDA_ALLOC_FAILED: allocation fails on device memory
PFAC_setTextureMode(handle, PFAC_TEXTURE_OFF ) ;
At position 0, match pattern 1
At position 1, match pattern 3
At position 2, match pattern 4
At position 4, match pattern 4
At position 6, match pattern 2
// Bind handle->d_PFAC_table (handle->sizeOfTableInBytes bytes) through the
// texture-reference pointer texRefTable, passing the channel descriptor
// explicitly; the resulting offset is stored in `offset`.
cuda_status = cudaBindTexture( &offset, (const struct textureReference*) texRefTable, (const void*) handle->d_PFAC_table, (const struct cudaChannelFormatDesc*) &channelDesc, handle->sizeOfTableInBytes ) ;
// Bind handle->d_PFAC_table (handle->sizeOfTableInBytes bytes) to
// tex_PFAC_table directly, with no explicit channel descriptor argument;
// the resulting offset is stored in `offset`.
cuda_status = cudaBindTexture( &offset, tex_PFAC_table, (const void*) handle->d_PFAC_table, handle->sizeOfTableInBytes ) ;
PFAC_setTextureMode(handle, PFAC_TEXTURE_ON ) ;
# Copy main.cvd to main.tar.gz, skipping its first 512-byte block,
# then extract the resulting archive. These are two separate commands:
# on one line, "tar xzvf main.tar.gz" would be passed to dd as operands.
dd if=main.cvd of=main.tar.gz bs=512 skip=1
tar xzvf main.tar.gz
#include <stdio.h> #include <iostream> #include <stdlib.h> #include <string.h> #include <assert.h> #include <chrono> #include <PFAC.h> int main(int argc, char **argv) { if(argc < 2){ printf("args input file, input pattern\n" ); return 0; } char dumpTableFile[] = "table.txt" ; char *inputFile = argv[1]; //"../test/data/example_input" ; char *patternFile = argv[2];//"../test/pattern/example_pattern" ; PFAC_handle_t handle ; PFAC_status_t PFAC_status ; int input_size ; char *h_inputString = NULL ; int *h_matched_result = NULL ; // step 1: create PFAC handle PFAC_status = PFAC_create( &handle ) ; PFAC_status = PFAC_setTextureMode(handle, PFAC_TEXTURE_OFF); printf("%d\n", PFAC_status); assert( PFAC_STATUS_SUCCESS == PFAC_status ); // step 2: read patterns and dump transition table PFAC_status = PFAC_readPatternFromFile( handle, patternFile) ; if ( PFAC_STATUS_SUCCESS != PFAC_status ){ printf("Error: fails to read pattern from file, %s\n", PFAC_getErrorString(PFAC_status) ); exit(1) ; } // dump transition table FILE *table_fp = fopen( dumpTableFile, "w") ; assert( NULL != table_fp ) ; PFAC_status = PFAC_dumpTransitionTable( handle, table_fp ); fclose( table_fp ) ; if ( PFAC_STATUS_SUCCESS != PFAC_status ){ printf("Error: fails to dump transition table, %s\n", PFAC_getErrorString(PFAC_status) ); exit(1) ; } //step 3: prepare input stream FILE* fpin = fopen( inputFile, "rb"); assert ( NULL != fpin ) ; // obtain file size fseek (fpin , 0 , SEEK_END); input_size = ftell (fpin); rewind (fpin); // allocate memory to contain the whole file h_inputString = (char *) malloc (sizeof(char)*input_size); assert( NULL != h_inputString ); h_matched_result = (int *) malloc (sizeof(int)*input_size); assert( NULL != h_matched_result ); memset( h_matched_result, 0, sizeof(int)*input_size ) ; // copy the file into the buffer input_size = fread (h_inputString, 1, input_size, fpin); fclose(fpin); auto started = std::chrono::high_resolution_clock::now(); // step 4: run PFAC on GPU PFAC_status = 
PFAC_matchFromHost( handle, h_inputString, input_size, h_matched_result ) ; if ( PFAC_STATUS_SUCCESS != PFAC_status ){ printf("Error: fails to PFAC_matchFromHost, %s\n", PFAC_getErrorString(PFAC_status) ); exit(1) ; } auto done = std::chrono::high_resolution_clock::now(); std::cout << "gpu_time: " << std::chrono::duration_cast<std::chrono::milliseconds>(done-started).count()<< std::endl; memset( h_matched_result, 0, sizeof(int)*input_size ) ; PFAC_setPlatform(handle, PFAC_PLATFORM_CPU); started = std::chrono::high_resolution_clock::now(); // step 4: run PFAC on CPU PFAC_status = PFAC_matchFromHost( handle, h_inputString, input_size, h_matched_result ) ; if ( PFAC_STATUS_SUCCESS != PFAC_status ){ printf("Error: fails to PFAC_matchFromHost, %s\n", PFAC_getErrorString(PFAC_status) ); exit(1) ; } done = std::chrono::high_resolution_clock::now(); std::cout << "cpu_time: " << std::chrono::duration_cast<std::chrono::milliseconds>(done-started).count()<< std::endl; memset( h_matched_result, 0, sizeof(int)*input_size ) ; PFAC_setPlatform(handle, PFAC_PLATFORM_CPU_OMP); started = std::chrono::high_resolution_clock::now(); // step 4: run PFAC on OMP PFAC_status = PFAC_matchFromHost( handle, h_inputString, input_size, h_matched_result ) ; if ( PFAC_STATUS_SUCCESS != PFAC_status ){ printf("Error: fails to PFAC_matchFromHost, %s\n", PFAC_getErrorString(PFAC_status) ); exit(1) ; } done = std::chrono::high_resolution_clock::now(); std::cout << "omp_time: " << std::chrono::duration_cast<std::chrono::milliseconds>(done-started).count() << std::endl; PFAC_status = PFAC_destroy( handle ) ; assert( PFAC_STATUS_SUCCESS == PFAC_status ); free(h_inputString); free(h_matched_result); return 0; }
export OMP_NUM_THREADS=4
Source: https://habr.com/ru/post/344938/
All Articles