Browse Source

wstring utils

David Rose 20 years ago
parent
commit
56b25dafe6

+ 20 - 0
panda/src/express/textEncoder.I

@@ -371,6 +371,26 @@ unicode_isupper(int character) {
   return entry->_char_type == UnicodeLatinMap::CT_upper;
 }
 
+////////////////////////////////////////////////////////////////////
+//     Function: TextEncoder::unicode_isspace
+//       Access: Published, Static
+//  Description: Returns true if the indicated character is a
+//               whitespace character, false otherwise.  This
+//               accepts the same six characters that ctype's
+//               isspace() accepts in the default "C" locale:
+//               space, tab, newline, vertical tab, form feed, and
+//               carriage return.  No character outside the ASCII
+//               range is currently treated as whitespace.
+////////////////////////////////////////////////////////////////////
+INLINE bool TextEncoder::
+unicode_isspace(int character) {
+  switch (character) {
+  case ' ':
+  case '\t':
+  case '\n':
+  case '\v':
+  case '\f':
+  case '\r':
+    // The complete "C" locale isspace() set.  Including '\r' matters
+    // in particular for text with CRLF line endings.
+    return true;
+
+  default:
+    return false;
+  }
+}
+
 ////////////////////////////////////////////////////////////////////
 //     Function: TextEncoder::unicode_islower
 //       Access: Published, Static

+ 1 - 0
panda/src/express/textEncoder.h

@@ -79,6 +79,7 @@ PUBLISHED:
   INLINE static bool unicode_ispunct(int character);
   INLINE static bool unicode_islower(int character);
   INLINE static bool unicode_isupper(int character);
+  INLINE static bool unicode_isspace(int character);
   INLINE static int unicode_toupper(int character);
   INLINE static int unicode_tolower(int character);
 

+ 114 - 0
panda/src/putil/string_utils.cxx

@@ -17,6 +17,7 @@
 ////////////////////////////////////////////////////////////////////
 
 #include "string_utils.h"
+#include "textEncoder.h"
 
 #include <ctype.h>
 
@@ -131,6 +132,40 @@ extract_words(const string &str, vector_string &words) {
   return num_words;
 }
 
+////////////////////////////////////////////////////////////////////
+//     Function: extract_words
+//  Description: Splits the wide string into whitespace-separated
+//               words, using TextEncoder::unicode_isspace() to
+//               decide what counts as whitespace.  Each word found
+//               is appended to the words vector; the vector is not
+//               cleared first, so the caller should empty it
+//               beforehand unless appending is intended.
+//
+//               Returns the number of words appended.
+////////////////////////////////////////////////////////////////////
+int
+extract_words(const wstring &str, pvector<wstring> &words) {
+  const size_t length = str.length();
+  int count = 0;
+  size_t cursor = 0;
+
+  for (;;) {
+    // Step over the run of whitespace that precedes the next word.
+    while (cursor < length && TextEncoder::unicode_isspace(str[cursor])) {
+      ++cursor;
+    }
+    if (cursor >= length) {
+      // Nothing but whitespace (or nothing at all) remained.
+      break;
+    }
+
+    // The cursor now rests on a non-whitespace character; advance to
+    // the end of this word.
+    const size_t first = cursor;
+    do {
+      ++cursor;
+    } while (cursor < length && !TextEncoder::unicode_isspace(str[cursor]));
+
+    words.push_back(str.substr(first, cursor - first));
+    ++count;
+  }
+
+  return count;
+}
+
 ////////////////////////////////////////////////////////////////////
 //     Function: tokenize
 //  Description: Chops the source string up into pieces delimited by
@@ -158,6 +193,33 @@ tokenize(const string &str, vector_string &words, const string &delimiters) {
   words.push_back(string());
 }
 
+////////////////////////////////////////////////////////////////////
+//     Function: tokenize
+//  Description: Chops the source string up into pieces delimited by
+//               any of the characters specified in delimiters.
+//               Repeated delimiter characters represent zero-length
+//               tokens.  An empty source string, or one that ends
+//               with a delimiter, produces a final zero-length
+//               token.
+//
+//               It is the user's responsibility to ensure the output
+//               vector is cleared before calling this function; the
+//               results will simply be appended to the end of the
+//               vector.
+////////////////////////////////////////////////////////////////////
+void
+tokenize(const wstring &str, pvector<wstring> &words, const wstring &delimiters) {
+  size_t p = 0;
+  while (p < str.length()) {
+    size_t q = str.find_first_of(delimiters, p);
+    // Compare against the sentinel of the string type actually being
+    // searched (wstring, not string).
+    if (q == wstring::npos) {
+      // No further delimiter: the remainder is the last token.
+      words.push_back(str.substr(p));
+      return;
+    }
+    words.push_back(str.substr(p, q - p));
+    p = q + 1;
+  }
+  // Reached only when str is empty or its last character is a
+  // delimiter; either way there is one trailing zero-length token.
+  words.push_back(wstring());
+}
+
 ////////////////////////////////////////////////////////////////////
 //     Function: trim_left
 //  Description: Returns a new string representing the contents of the
@@ -173,6 +235,21 @@ trim_left(const string &str) {
   return str.substr(begin);
 }
 
+////////////////////////////////////////////////////////////////////
+//     Function: trim_left
+//  Description: Returns a copy of the given wide string with any
+//               leading whitespace, as determined by
+//               TextEncoder::unicode_isspace(), stripped off.
+////////////////////////////////////////////////////////////////////
+wstring
+trim_left(const wstring &str) {
+  const size_t length = str.size();
+
+  // Find the index of the first non-whitespace character, if any.
+  size_t first = 0;
+  for (; first < length; ++first) {
+    if (!TextEncoder::unicode_isspace(str[first])) {
+      break;
+    }
+  }
+
+  return str.substr(first);
+}
+
 ////////////////////////////////////////////////////////////////////
 //     Function: trim_right
 //  Description: Returns a new string representing the contents of the
@@ -189,6 +266,22 @@ trim_right(const string &str) {
   return str.substr(begin, end - begin);
 }
 
+////////////////////////////////////////////////////////////////////
+//     Function: trim_right
+//  Description: Returns a copy of the given wide string with any
+//               trailing whitespace, as determined by
+//               TextEncoder::unicode_isspace(), stripped off.
+////////////////////////////////////////////////////////////////////
+wstring
+trim_right(const wstring &str) {
+  // Number of leading characters to keep; shrinks while the last kept
+  // character is whitespace.
+  size_t keep = str.size();
+  while (keep != 0 && TextEncoder::unicode_isspace(str[keep - 1])) {
+    --keep;
+  }
+
+  return str.substr(0, keep);
+}
+
 ////////////////////////////////////////////////////////////////////
 //     Function: trim
 //  Description: Returns a new string representing the contents of the
@@ -210,6 +303,27 @@ trim(const string &str) {
   return str.substr(begin, end - begin);
 }
 
+////////////////////////////////////////////////////////////////////
+//     Function: trim
+//  Description: Returns a copy of the given wide string with
+//               whitespace, as determined by
+//               TextEncoder::unicode_isspace(), stripped from both
+//               the beginning and the end.
+////////////////////////////////////////////////////////////////////
+wstring
+trim(const wstring &str) {
+  const size_t length = str.size();
+
+  // Advance past the leading whitespace.
+  size_t first = 0;
+  while (first != length && TextEncoder::unicode_isspace(str[first])) {
+    ++first;
+  }
+
+  // Back up over the trailing whitespace, never crossing first.
+  size_t last = length;
+  for (; last > first; --last) {
+    if (!TextEncoder::unicode_isspace(str[last - 1])) {
+      break;
+    }
+  }
+
+  return str.substr(first, last - first);
+}
+
 ////////////////////////////////////////////////////////////////////
 //     Function: string_to_int
 //  Description: A string-interface wrapper around the C library

+ 6 - 0
panda/src/putil/string_utils.h

@@ -39,15 +39,21 @@ EXPCL_PANDA string upcase(const string &s);
 
 // Separates the string into words according to whitespace.
 EXPCL_PANDA int extract_words(const string &str, vector_string &words);
+EXPCL_PANDA int extract_words(const wstring &str, pvector<wstring> &words);
 
 // Separates the string into words according to the indicated delimiters.
 EXPCL_PANDA void tokenize(const string &str, vector_string &words,
                           const string &delimiters);
+EXPCL_PANDA void tokenize(const wstring &str, pvector<wstring> &words,
+                          const wstring &delimiters);
 
 // Trims leading and/or trailing whitespace from the string.
 EXPCL_PANDA string trim_left(const string &str);
+EXPCL_PANDA wstring trim_left(const wstring &str);
 EXPCL_PANDA string trim_right(const string &str);
+EXPCL_PANDA wstring trim_right(const wstring &str);
 EXPCL_PANDA string trim(const string &str);
+EXPCL_PANDA wstring trim(const wstring &str);
 
 // Functions to parse numeric values out of a string.
 EXPCL_PANDA int string_to_int(const string &str, string &tail);