You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

777 lines
24 KiB

/***********************************************************************
* © 2016 and later: Unicode, Inc. and others.
* License & terms of use: http://www.unicode.org/copyright.html
*
***********************************************************************
***********************************************************************
* COPYRIGHT:
* Copyright (C) 2001-2012 IBM, Inc. All Rights Reserved.
*
***********************************************************************/
/********************************************************************************
*
* File ubrkperf.cpp
*
* Modification History:
* Name Description
* Vladimir Weinstein First Version, based on collperf
*
*********************************************************************************
*/
//
// This program tests break iterator performance
// Currently we test only ICU APIs with the future possibility of testing *nix & win32 APIs
// (if any)
// A text file is required as input. It must be in utf-8 or utf-16 format,
// and include a byte order mark. Either LE or BE format is OK.
//
// Help text printed by main() when option parsing fails, when -help / -? is
// given, or when no input file was named.
//
// Keep this list in sync with the opts[] table: every switch listed here must
// be accepted by ProcessOptions(), and every switch in the table should be
// described here. (The former "-iloop" entry was dropped because no such
// switch exists in the table, so using it produced "Unrecognized option".)
const char gUsageString[] =
"usage: ubrkperf options...\n"
"-help Display this message.\n"
"-? Same as -help.\n"
"-file file_name utf-16/utf-8 format file.\n"
"-locale name ICU locale to use. Default is en_US\n"
"-langid 0x1234 Windows Language ID number. Default to value for -locale option\n"
" see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
"-win Run test using Windows native services. (currently not working) (ICU is default)\n"
"-unix Run test using Unix word breaking services. (currently not working) \n"
"-mac Run test using MacOSX word breaking services.\n"
"-uselen Use API with string lengths. Default is null-terminated strings\n"
"-char Use character break iterator\n"
"-word Use word break iterator\n"
"-line Use line break iterator\n"
"-sentence Use sentence break iterator\n"
"-loop nnnn Loopcount for test. Adjust for reasonable total running time.\n"
"-passes n Number of times the selected test is repeated. Default = 1.\n"
"-time n Accepted, but no test is run for it. (currently not working)\n"
"-terse Terse numbers-only output. Intended for use by scripts.\n"
"-dump Display stuff.\n"
"-capi Use C APIs instead of C++ APIs (currently not working)\n"
"-next Do the next test\n"
"-isBound Do the isBound test\n"
;
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include <locale.h>
#include <errno.h>
#include <sys/stat.h>
#include <unicode/utypes.h>
#include <unicode/ucol.h>
#include <unicode/ucoleitr.h>
#include <unicode/uloc.h>
#include <unicode/ustring.h>
#include <unicode/ures.h>
#include <unicode/uchar.h>
#include <unicode/ucnv.h>
#include <unicode/utf8.h>
#include <unicode/brkiter.h>
#if U_PLATFORM_HAS_WIN32_API
#include <windows.h>
#else
//
// Stubs for Windows API functions when building on UNIXes.
//
#include <sys/time.h>
// Stand-in for the Win32 timeGetTime(): returns the current gettimeofday()
// reading expressed in milliseconds. Only differences between two readings
// are meaningful to the callers; wrap-around of the value is tolerated.
unsigned long timeGetTime() {
    struct timeval now;
    gettimeofday(&now, 0);
    const unsigned long wholeSecondsAsMs = now.tv_sec * 1000;
    const unsigned long fractionAsMs = now.tv_usec / 1000;
    return wholeSecondsAsMs + fractionAsMs;
}
#define MAKELCID(a,b) 0
#endif
//
// Command line option variables
// These global variables are set according to the options specified
// on the command line by the user.
// String-valued options are declared as pointers to const: they either point
// at a string literal (the default) or at an element of argv, which
// ProcessOptions() stores through a `const char **`.
const char * opt_fName = 0;         // -file: path of the input text file (required).
const char * opt_locale = "en_US";  // -locale: locale used to create the break iterator.
int opt_langid = 0; // Defaults to value corresponding to opt_locale.
const char * opt_rules = 0;         // No command line switch sets this.
UBool opt_help = FALSE;             // -help / -?
int opt_time = 0;                   // -time: parsed, but main() runs no test for it.
int opt_loopCount = 0;              // -loop: iterations of the timed loop; 0 means "run nothing".
int opt_passesCount= 1;             // -passes: how many times main() repeats the selected test.
UBool opt_terse = FALSE;            // -terse: numbers-only output.
UBool opt_icu = TRUE;               // Cleared by main() when -win, -unix or -mac is given.
UBool opt_win = FALSE; // Run with Windows native functions.
UBool opt_unix = FALSE; // Run with UNIX strcoll, strxfrm functions.
UBool opt_mac = FALSE; // Run with MacOSX word break services.
UBool opt_uselen = FALSE;           // -uselen
UBool opt_dump = FALSE;             // -dump
UBool opt_char = FALSE;             // -char: character break iterator (also the default).
UBool opt_word = FALSE;             // -word
UBool opt_line = FALSE;             // -line
UBool opt_sentence = FALSE;         // -sentence
UBool opt_capi = FALSE;             // -capi
UBool opt_next = FALSE;             // -next: run doForwardTest() (also the default test).
UBool opt_isBound = FALSE;          // -isBound: run doIsBoundTest().
//
// Definitions for the command line options
//
// One entry of the command line option table.
struct OptSpec {
    // How the option is parsed:
    //   FLAG   - takes no value; the target UBool is set to TRUE.
    //   NUM    - followed by an integer stored into the target int.
    //   STRING - followed by a string whose pointer is stored into the target.
    enum Kind { FLAG, NUM, STRING };

    const char *name;   // Switch text including the leading '-'; 0 ends the table.
    Kind        type;   // Selects how the value (if any) is parsed.
    void       *pVar;   // Address of the variable that receives the value.
};
// Table of every switch ProcessOptions() recognizes. Each row gives the switch
// text, how its value is parsed, and the global variable that receives it.
// Rows are matched by exact string comparison; the row with a null name
// terminates the table.
OptSpec opts[] = {
{"-file", OptSpec::STRING, &opt_fName},
{"-locale", OptSpec::STRING, &opt_locale},
{"-langid", OptSpec::NUM, &opt_langid},
{"-win", OptSpec::FLAG, &opt_win},
{"-unix", OptSpec::FLAG, &opt_unix},
{"-mac", OptSpec::FLAG, &opt_mac},
{"-uselen", OptSpec::FLAG, &opt_uselen},
{"-loop", OptSpec::NUM, &opt_loopCount},
{"-time", OptSpec::NUM, &opt_time},
{"-passes", OptSpec::NUM, &opt_passesCount},
{"-char", OptSpec::FLAG, &opt_char},
{"-word", OptSpec::FLAG, &opt_word},
{"-line", OptSpec::FLAG, &opt_line},
{"-sentence", OptSpec::FLAG, &opt_sentence},
{"-terse", OptSpec::FLAG, &opt_terse},
{"-dump", OptSpec::FLAG, &opt_dump},
{"-capi", OptSpec::FLAG, &opt_capi},
{"-next", OptSpec::FLAG, &opt_next},
{"-isBound", OptSpec::FLAG, &opt_isBound},
{"-help", OptSpec::FLAG, &opt_help},
// "-?" is an alias: it sets the same variable as "-help".
{"-?", OptSpec::FLAG, &opt_help},
// End-of-table sentinel (null name).
{0, OptSpec::FLAG, 0}
};
//---------------------------------------------------------------------------
//
// Global variables pointing to and describing the test file
//
//---------------------------------------------------------------------------
//DWORD gWinLCID;
// Break iterator under test; assigned by createICUBrkIt() and deleted by main().
BreakIterator *brkit = NULL;
// Heap buffer (malloc/realloc in main()) holding the input file as UTF-16 code units.
UChar *text = NULL;
// Number of UTF-16 code units stored in `text`.
int32_t textSize = 0;
#if U_PLATFORM_IS_DARWIN_BASED
#include <ApplicationServices/ApplicationServices.h>
// Union of the cluster, word and line break masks; passed to
// UCCreateTextBreakLocator() in createMACBrkIt().
enum{
kUCTextBreakAllMask = (kUCTextBreakClusterMask | kUCTextBreakWordMask | kUCTextBreakLineMask)
};
// Table of the individual break-type masks. NOTE(review): not referenced by
// any code visible in this file.
UCTextBreakType breakTypes[4] = {kUCTextBreakCharMask, kUCTextBreakClusterMask, kUCTextBreakWordMask, kUCTextBreakLineMask};
// Text break locator created by createMACBrkIt() and disposed in doForwardTest().
TextBreakLocatorRef breakRef;
// Break type selected by createMACBrkIt() from the -char/-word/-line options.
UCTextBreakType macBreakType;
void createMACBrkIt() {
OSStatus status = noErr;
LocaleRef lref;
status = LocaleRefFromLocaleString(opt_locale, &lref);
status = UCCreateTextBreakLocator(lref, 0, kUCTextBreakAllMask, (TextBreakLocatorRef*)&breakRef);
if(opt_char == TRUE) {
macBreakType = kUCTextBreakClusterMask;
} else if(opt_word == TRUE) {
macBreakType = kUCTextBreakWordMask;
} else if(opt_line == TRUE) {
macBreakType = kUCTextBreakLineMask;
} else if(opt_sentence == TRUE) {
// error
// brkit = BreakIterator::createSentenceInstance(opt_locale, status);
} else {
// default is character iterator
macBreakType = kUCTextBreakClusterMask;
}
}
#endif
void createICUBrkIt() {
//
// Set up an ICU break iterator
//
UErrorCode status = U_ZERO_ERROR;
if(opt_char == TRUE) {
brkit = BreakIterator::createCharacterInstance(opt_locale, status);
} else if(opt_word == TRUE) {
brkit = BreakIterator::createWordInstance(opt_locale, status);
} else if(opt_line == TRUE) {
brkit = BreakIterator::createLineInstance(opt_locale, status);
} else if(opt_sentence == TRUE) {
brkit = BreakIterator::createSentenceInstance(opt_locale, status);
} else {
// default is character iterator
brkit = BreakIterator::createCharacterInstance(opt_locale, status);
}
if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) {
fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale);
}
if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) {
fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale);
}
}
//---------------------------------------------------------------------------
//
// ProcessOptions() Function to read the command line options.
//
//---------------------------------------------------------------------------
// Parses the command line against the option table.
//
//   argc, argv  The arguments as received by main(); argv[0] is skipped.
//   opts        Option table, terminated by an entry whose name is null.
//
// For each argument the matching table entry decides what happens:
//   FLAG    the UBool at pVar is set to TRUE;
//   STRING  the next argument's pointer is stored in the const char* at pVar;
//   NUM     the next argument is parsed with strtol() (base auto-detected, so
//           0x... and 0... prefixes work) and stored in the int at pVar.
//
// Returns TRUE when every argument was consumed. Returns FALSE, after printing
// a message to stderr, for an unrecognized option, a missing value, or a
// numeric value that is not entirely a number or does not fit in an int.
UBool ProcessOptions(int argc, const char **argv, OptSpec opts[])
{
    for (int argNum = 1; argNum < argc; argNum++) {
        const char *pArgName = argv[argNum];

        // Locate the table entry for this switch; stops on the sentinel if none matches.
        OptSpec *pOpt = opts;
        while (pOpt->name != 0 && strcmp(pOpt->name, pArgName) != 0) {
            pOpt++;
        }
        if (pOpt->name == 0) {
            fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName);
            return FALSE;
        }

        switch (pOpt->type) {
        case OptSpec::FLAG:
            *(UBool *)(pOpt->pVar) = TRUE;
            break;

        case OptSpec::STRING:
            argNum++;
            if (argNum >= argc) {
                fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
                return FALSE;
            }
            *(const char **)(pOpt->pVar) = argv[argNum];
            break;

        case OptSpec::NUM:
            {
                argNum++;
                if (argNum >= argc) {
                    fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
                    return FALSE;
                }
                char *endp = 0;
                errno = 0;
                const long value = strtol(argv[argNum], &endp, 0);
                // Reject "no digits", trailing garbage ("12abc"), and values
                // that overflow long or would be truncated when stored as int.
                if (endp == argv[argNum] || *endp != 0 ||
                        errno == ERANGE || (long)(int)value != value) {
                    fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name);
                    return FALSE;
                }
                *(int *)(pOpt->pVar) = (int)value;
                break;
            }
        }
    }
    return TRUE;
}
void doForwardTest() {
if (opt_terse == FALSE) {
printf("Doing the forward test\n");
}
int32_t noBreaks = 0;
int32_t i = 0;
unsigned long startTime = timeGetTime();
unsigned long elapsedTime = 0;
if(opt_icu) {
createICUBrkIt();
brkit->setText(UnicodeString(text, textSize));
brkit->first();
if (opt_terse == FALSE) {
printf("Warmup\n");
}
int j;
while((j = brkit->next()) != BreakIterator::DONE) {
noBreaks++;
//fprintf(stderr, "%d ", j);
}
if (opt_terse == FALSE) {
printf("Measure\n");
}
startTime = timeGetTime();
for(i = 0; i < opt_loopCount; i++) {
brkit->first();
while(brkit->next() != BreakIterator::DONE) {
}
}
elapsedTime = timeGetTime()-startTime;
} else if(opt_mac) {
#if U_PLATFORM_IS_DARWIN_BASED
createMACBrkIt();
UniChar* filePtr = text;
OSStatus status = noErr;
UniCharCount startOffset = 0, breakOffset = 0, numUniChars = textSize;
startOffset = 0;
//printf("\t---Search forward--\n");
while (startOffset < numUniChars)
{
status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars,
startOffset, &breakOffset);
//require_action(status == noErr, EXIT, printf( "**UCFindTextBreak failed: startOffset %d, status %d\n", (int)startOffset, (int)status));
//require_action((breakOffset <= numUniChars),EXIT, printf("**UCFindTextBreak breakOffset too big: startOffset %d, breakOffset %d\n", (int)startOffset, (int)breakOffset));
// Output break
//printf("\t%d\n", (int)breakOffset);
// Increment counters
noBreaks++;
startOffset = breakOffset;
}
startTime = timeGetTime();
for(i = 0; i < opt_loopCount; i++) {
startOffset = 0;
while (startOffset < numUniChars)
{
status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars,
startOffset, &breakOffset);
// Increment counters
startOffset = breakOffset;
}
}
elapsedTime = timeGetTime()-startTime;
UCDisposeTextBreakLocator(&breakRef);
#endif
}
if (opt_terse == FALSE) {
int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount));
int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize));
int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks));
printf("forward break iteration average loop time %d\n", loopTime);
printf("number of code units %d average time per code unit %d\n", textSize, timePerCU);
printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak);
} else {
printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize);
}
}
void doIsBoundTest() {
int32_t noBreaks = 0, hit = 0;
int32_t i = 0, j = 0;
unsigned long startTime = timeGetTime();
unsigned long elapsedTime = 0;
createICUBrkIt();
brkit->setText(UnicodeString(text, textSize));
brkit->first();
for(j = 0; j < textSize; j++) {
if(brkit->isBoundary(j)) {
noBreaks++;
//fprintf(stderr, "%d ", j);
}
}
/*
while(brkit->next() != BreakIterator::DONE) {
noBreaks++;
}
*/
startTime = timeGetTime();
for(i = 0; i < opt_loopCount; i++) {
for(j = 0; j < textSize; j++) {
if(brkit->isBoundary(j)) {
hit++;
}
}
}
elapsedTime = timeGetTime()-startTime;
int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount));
if (opt_terse == FALSE) {
int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize));
int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks));
printf("forward break iteration average loop time %d\n", loopTime);
printf("number of code units %d average time per code unit %d\n", textSize, timePerCU);
printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak);
} else {
printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize);
}
}
//----------------------------------------------------------------------------------------
//
// UnixConvert -- Convert the lines of the file to the encoding for UNIX
// Since it appears that Unicode support is going in the general
// direction of the use of UTF-8 locales, that is the approach
// that is used here.
//
//----------------------------------------------------------------------------------------
// Currently a no-op: the entire body is compiled out with #if 0.
// The disabled code converts each entry of a per-line table (gFileLines /
// gNumFileLines, neither of which is declared in this file) from UTF-16 to
// UTF-8 using an ICU converter, sizing each target buffer with a preflight call.
void UnixConvert() {
#if 0
int line;
UConverter *cvrtr; // An ICU code page converter.
UErrorCode status = U_ZERO_ERROR;
cvrtr = ucnv_open("utf-8", &status); // we are just doing UTF-8 locales for now.
if (U_FAILURE(status)) {
// NOTE(review): this passes &status (a pointer) for %d; if the code is ever
// re-enabled it should pass status itself.
fprintf(stderr, "ICU Converter open failed.: %d\n", &status);
exit(-1);
}
// redo for unix
for (line=0; line < gNumFileLines; line++) {
// Preflight: with a null target the call only reports the size needed.
int sizeNeeded = ucnv_fromUChars(cvrtr,
0, // ptr to target buffer.
0, // length of target buffer.
gFileLines[line].name,
-1, // source is null terminated
&status);
// U_BUFFER_OVERFLOW_ERROR is the expected outcome of the preflight call.
if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) {
fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
exit(-1);
}
status = U_ZERO_ERROR;
gFileLines[line].unixName = new char[sizeNeeded+1];
// Second call performs the actual conversion into the new buffer.
sizeNeeded = ucnv_fromUChars(cvrtr,
gFileLines[line].unixName, // ptr to target buffer.
sizeNeeded+1, // length of target buffer.
gFileLines[line].name,
-1, // source is null terminated
&status);
if (U_FAILURE(status)) {
fprintf(stderr, "ICU Conversion Failed.: %d\n", status);
exit(-1);
}
gFileLines[line].unixName[sizeNeeded] = 0;
};
ucnv_close(cvrtr);
#endif
}
//----------------------------------------------------------------------------------------
//
// class UCharFile Class to hide all the gorp to read a file in
// and produce a stream of UChars.
//
//----------------------------------------------------------------------------------------
class UCharFile {
public:
UCharFile(const char *fileName);
~UCharFile();
UChar get();
UBool eof() {return fEof;};
UBool error() {return fError;};
int32_t size() { return fFileSize; };
private:
UCharFile (const UCharFile &other) {}; // No copy constructor.
UCharFile & operator = (const UCharFile &other) {return *this;}; // No assignment op
FILE *fFile;
const char *fName;
UBool fEof;
UBool fError;
UChar fPending2ndSurrogate;
int32_t fFileSize;
enum {UTF16LE, UTF16BE, UTF8} fEncoding;
};
// Opens fileName in binary mode, records its size, and consumes the byte order
// mark to decide how get() will decode the rest of the file.
//
// On failure (file cannot be opened, or no UTF-8/UTF-16 BOM is present) a
// message is written to stderr and error() returns TRUE. A failing stat() is
// not an error: the size is then reported as -1.
UCharFile::UCharFile(const char * fileName) {
    fEof = FALSE;
    fError = FALSE;
    fName = fileName;
    fPending2ndSurrogate = 0;
    // Give the encoding a defined value even on the error paths below, so the
    // member is never read uninitialized.
    fEncoding = UTF8;

    struct stat buf;
    if (stat(fileName, &buf) != 0) {
        fprintf(stderr, "Error getting info\n");
        fFileSize = -1;
    } else {
        fFileSize = buf.st_size;
    }

    fFile = fopen(fName, "rb");
    if (fFile == NULL) {
        // Report the name this object was asked to open, not a global.
        fprintf(stderr, "Can not open file \"%s\"\n", fName);
        fError = TRUE;
        return;
    }

    //
    // Look for the byte order mark at the start of the file.
    // The third byte is only read when the first two look like a UTF-8 BOM.
    //
    const int BOMC1 = fgetc(fFile);
    const int BOMC2 = fgetc(fFile);
    if (BOMC1 == 0xff && BOMC2 == 0xfe) {
        fEncoding = UTF16LE;
    } else if (BOMC1 == 0xfe && BOMC2 == 0xff) {
        fEncoding = UTF16BE;
    } else if (BOMC1 == 0xEF && BOMC2 == 0xBB && fgetc(fFile) == 0xBF) {
        fEncoding = UTF8;
    } else {
        fprintf(stderr, "ubrkperf: file \"%s\" encoding must be UTF-8 or UTF-16, and "
                        "must include a BOM.\n", fileName);
        fError = TRUE;
        return;
    }
}
// Closes the underlying stream. The stream is NULL when the constructor could
// not open the file, and fclose() must not be called with a null pointer.
UCharFile::~UCharFile() {
    if (fFile != NULL) {
        fclose(fFile);
        fFile = NULL;
    }
}
// Returns the next UTF-16 code unit of the file.
//
// UTF-16 input is assembled from two bytes in the file's byte order. UTF-8
// input is decoded one code point at a time; a supplementary code point is
// returned as two code units over two consecutive calls (the trail surrogate
// is kept in fPending2ndSurrogate in between).
//
// Returns 0 and makes eof() TRUE at end of file. Returns 0 and makes error()
// TRUE, after printing a message to stderr, when the UTF-8 data is malformed.
UChar UCharFile::get() {
    UChar c = 0;
    switch (fEncoding) {
    case UTF16LE:
        {
            const int cL = fgetc(fFile);
            const int cH = fgetc(fFile);
            // Test for EOF before combining: EOF is negative and must not be shifted.
            if (cH == EOF) {
                fEof = TRUE;
            } else {
                c = (UChar)(cL | (cH << 8));
            }
            break;
        }
    case UTF16BE:
        {
            const int cH = fgetc(fFile);
            const int cL = fgetc(fFile);
            if (cL == EOF) {
                fEof = TRUE;
            } else {
                c = (UChar)(cL | (cH << 8));
            }
            break;
        }
    case UTF8:
        {
            if (fPending2ndSurrogate != 0) {
                c = fPending2ndSurrogate;
                fPending2ndSurrogate = 0;
                break;
            }
            const int ch = fgetc(fFile); // Kept as int: the EOF test does not work on the UChar type.
            if (ch == EOF) {
                fEof = TRUE;
                break;
            }
            if (ch <= 0x7f) {
                // It's ascii. No further utf-8 conversion.
                c = (UChar)ch;
                break;
            }
            // Figure out the length of the char from its lead byte.
            // 0x80..0xBF are continuation bytes and 0xF8..0xFF are never valid
            // in UTF-8, so neither can start a character.
            int nBytes;
            if (ch >= 0xF8)      {nBytes = 0;}
            else if (ch >= 0xF0) {nBytes = 4;}
            else if (ch >= 0xE0) {nBytes = 3;}
            else if (ch >= 0xC0) {nBytes = 2;}
            else                 {nBytes = 0;}
            if (nBytes == 0) {
                // ftell() returns long, hence %ld.
                fprintf(stderr, "not likely utf-8 encoded file %s contains corrupt data at offset %ld.\n", fName, ftell(fFile));
                fError = TRUE;
                return 0;
            }
            // Read the rest of the bytes into a temp array, checking that each
            // one is a continuation byte. EOF fails the same check.
            unsigned char bytes[10];
            bytes[0] = (unsigned char)ch;
            int i;
            for (i = 1; i < nBytes; i++) {
                const int next = fgetc(fFile);
                if (next == EOF || next < 0x80 || next >= 0xc0) {
                    fprintf(stderr, "utf-8 encoded file %s contains corrupt data at offset %ld. Expected %d bytes, byte %d is invalid. First byte is %02X\n", fName, ftell(fFile), nBytes, i, (unsigned int)ch);
                    fError = TRUE;
                    return 0;
                }
                bytes[i] = (unsigned char)next;
            }
            // Convert the bytes from the temp array to a Unicode char.
            i = 0;
            uint32_t cp;
            U8_NEXT_UNSAFE(bytes, i, cp);
            c = (UChar)cp;
            if (cp >= 0x10000) {
                // The code point needs to be broken up into a utf-16 surrogate pair.
                // Return the lead surrogate now and remember the trail
                // surrogate for the next call.
                c = U16_LEAD(cp);
                fPending2ndSurrogate = U16_TRAIL(cp);
            }
            break;
        }
    }
    return c;
}
//----------------------------------------------------------------------------------------
//
// Main -- process command line, read in and pre-process the test file,
// call other functions to do the actual tests.
//
//----------------------------------------------------------------------------------------
int main(int argc, const char** argv) {
if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) {
printf(gUsageString);
exit (1);
}
// Make sure that we've only got one API selected.
if (opt_mac || opt_unix || opt_win) opt_icu = FALSE;
if (opt_mac || opt_unix) opt_win = FALSE;
if (opt_mac) opt_unix = FALSE;
UErrorCode status = U_ZERO_ERROR;
//
// Set up a Windows LCID
//
/*
if (opt_langid != 0) {
gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);
}
else {
gWinLCID = uloc_getLCID(opt_locale);
}
*/
//
// Set the UNIX locale
//
if (opt_unix) {
if (setlocale(LC_ALL, opt_locale) == 0) {
fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale);
exit(-1);
}
}
// Read in the input file.
// File assumed to be utf-16.
// Lines go onto heap buffers. Global index array to line starts is created.
// Lines themselves are null terminated.
//
UCharFile f(opt_fName);
if (f.error()) {
exit(-1);
}
int32_t fileSize = f.size();
const int STARTSIZE = 70000;
int32_t bufSize = 0;
int32_t charCount = 0;
if(fileSize != -1) {
text = (UChar *)malloc(fileSize*sizeof(UChar));
bufSize = fileSize;
} else {
text = (UChar *)malloc(STARTSIZE*sizeof(UChar));
bufSize = STARTSIZE;
}
if(text == NULL) {
fprintf(stderr, "Allocating buffer failed\n");
exit(-1);
}
// Read the file, split into lines, and save in memory.
// Loop runs once per utf-16 value from the input file,
// (The number of bytes read from file per loop iteration depends on external encoding.)
for (;;) {
UChar c = f.get();
if(f.eof()) {
break;
}
if (f.error()){
exit(-1);
}
// We now have a good UTF-16 value in c.
text[charCount++] = c;
if(charCount == bufSize) {
text = (UChar *)realloc(text, 2*bufSize*sizeof(UChar));
if(text == NULL) {
fprintf(stderr, "Reallocating buffer failed\n");
exit(-1);
}
bufSize *= 2;
}
}
if (opt_terse == FALSE) {
printf("file \"%s\", %d charCount code units.\n", opt_fName, charCount);
}
textSize = charCount;
//
// Dump file contents if requested.
//
if (opt_dump) {
// dump file, etc... possibly
}
//
// We've got the file read into memory. Go do something with it.
//
int32_t i = 0;
for(i = 0; i < opt_passesCount; i++) {
if(opt_loopCount != 0) {
if(opt_next) {
doForwardTest();
} else if(opt_isBound) {
doIsBoundTest();
} else {
doForwardTest();
}
} else if(opt_time != 0) {
}
}
if(text != NULL) {
free(text);
}
if(brkit != NULL) {
delete brkit;
}
return 0;
}