Lots more tests, and some bugfixes of utf8 code. More to come...

author: deva <deva> 2010-01-18 14:26:28 +0000
committer: deva <deva> 2010-01-18 14:26:28 +0000
commit: 2f38783e005e43efd66727371fb0607ca9dfee29 (patch)
tree: 2fa12bd2b8a87f9f33c3f52d2dc5bd35b89a1517
parent: 26d9661aebf68b656af71e5cee27a7a69943706b (diff)
1 files changed, 155 insertions, 26 deletions
diff --git a/server/src/journalwriter.cc b/server/src/journalwriter.cc
index b84018a..db6939c 100644
--- a/server/src/journalwriter.cc
+++ b/server/src/journalwriter.cc
@@ -1,4 +1,4 @@
-/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2; coding: utf-8  -*- */
 /* vim: set et sw=2 ts=2: */
 /***************************************************************************
  *            journalwriter.cc
@@ -32,7 +32,7 @@
 
 static inline bool iswhitespace(char c)
 {
-  return c == ' ' || c == '\n' || c == '\t';
+  return c == ' ' || c == '\n' || c == '\t' || c == '\r';
 }
 
 /**
@@ -44,16 +44,63 @@ static std::string stripTrailingWhitepace(std::string str)
 
   ssize_t end = str.size() - 1;
 
-  while(end && iswhitespace(str[end]) 
-        && (end>0 && str[end-1] & 0x80) == false // Make sure we are not in a utf8 character.
-        ) {
-    end--;
-  }
+  while(end >= 0 && iswhitespace(str[end])) end--;
   end++;
 
   return str.substr(0, end);
 }
 
+/**
+ * Tell whether the byte at 'idx' is a continuation byte of a multi-byte
+ * UTF-8 character, ie. any byte of the character except its lead byte.
+ * Lead bytes and plain ASCII bytes are never inside a character.
+ * The string is taken by const reference; a by-value parameter would be
+ * copied on every call, which makes UTF8Length() quadratic.
+ * @param str The UTF-8 encoded string to inspect.
+ * @param idx Byte offset into str; out of range offsets yield false.
+ * @return true if str[idx] is a continuation byte covered by a lead byte
+ * found at most three bytes earlier, false otherwise.
+ */
+static bool isInsideUTF8(const std::string &str, size_t idx)
+{
+  // Only bytes of the form 10xxxxxx can be inside a character.
+  if(idx >= str.size() || (str[idx] & 0xC0) != 0x80) return false;
+
+  // Walk backwards over at most two more continuation bytes to the lead byte.
+  for(size_t dist = 1; dist <= 3 && dist <= idx; dist++) {
+    unsigned char c = static_cast<unsigned char>(str[idx - dist]);
+    if((c & 0xC0) == 0x80) continue;
+
+    // Number of bytes in the character announced by the lead byte.
+    size_t len = 0;
+    if((c & 0xE0) == 0xC0) len = 2;      // 110xxxxx
+    else if((c & 0xF0) == 0xE0) len = 3; // 1110xxxx
+    else if((c & 0xF8) == 0xF0) len = 4; // 11110xxx
+
+    // The byte at 'idx' is byte number dist + 1 of that character.
+    return len > dist;
+  }
+
+  // No lead byte within reach; this is a stray continuation byte.
+  return false;
+}
+
+/**
+ * Count the characters, as opposed to the bytes, of a UTF-8 string.
+ * Stray continuation bytes count as one character each.
+ * @param str The UTF-8 encoded string to measure.
+ * @return The number of bytes in str that are not inside a character.
+ */
+static size_t UTF8Length(const std::string &str)
+{
+  size_t size = 0;
+  for(size_t i = 0; i < str.size(); i++) {
+    // Every byte that is not inside a character starts a new one.
+    if(!isInsideUTF8(str, i)) size++;
+  }
+  return size;
+}
+
 /**
  * Find all lines longer than 'width', and insert a newline in the
  * first backward occurring space.
@@ -68,20 +115,25 @@ static std::string addNewlines(std::string str, size_t width)
 
     fraction += str[i];
 
-    if(iswhitespace(str[i]) 
-       && (i>0 && str[i-1] & 0x80) == false // Make sure we are not in a utf8 character.
-       ) {
-      if(linelen + fraction.size() > width) {
+    if(isInsideUTF8(str, i)) continue;
+    
+    if(iswhitespace(str[i]) ) {
+      if(linelen + UTF8Length(fraction) - 1 > width) {
         output[output.size() - 1] = '\n';
         linelen = 0;
       }
+
       output += fraction;
-      linelen += fraction.size();
+      linelen += UTF8Length(fraction);
       fraction = "";
     }
 
     if(str[i] == '\n') linelen = 0;
+  }
 
+  if(linelen + UTF8Length(fraction) > width) {
+    output[output.size() - 1] = '\n';
+    linelen = 0;
   }
   output += fraction;
 
@@ -173,25 +225,102 @@ void JournalWriter::commit()
 }
 
 #ifdef TEST_JOURNALWRITER
-//Additional dependency files
-//deps:
-//Required cflags (autoconf vars may be used)
-//cflags:
-//Required link options (autoconf vars may be used)
+//deps: debug.cc journal_commit.cc
+//cflags: -I..
 //libs:
 #include "test.h"
 
-TEST_BEGIN;
-
-std::string text = "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do\neiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.           \n\n    \t";
+#define LONG "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do\neiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.           \n\n    \t";
 
-std::string resume = stripTrailingWhitepace(addNewlines(text, 60));
-printf("[%s]\n", resume.c_str());
-
-resume = stripTrailingWhitepace(addNewlines("", 60));
-printf("[%s]\n", resume.c_str());
+TEST_BEGIN;
 
-// TODO: Put some testcode here (see test.h for usable macros).
+TEST_EQUAL_STR(stripTrailingWhitepace
+               ("Lorem ipsum dolor sit amet.           \n\n    \t"),
+                "Lorem ipsum dolor sit amet.", "Test wspace remover.");
+
+TEST_EQUAL_STR(stripTrailingWhitepace(""), "", "Test wspace remover on empty string.");
+
+TEST_EQUAL_STR(stripTrailingWhitepace("\n\t "), "", "Test wspace remover on wspace-only string.");
+
+TEST_EQUAL_STR(stripTrailingWhitepace("\n"), "", "Test wspace remover on newline only.");
+TEST_EQUAL_STR(stripTrailingWhitepace("\t"), "", "Test wspace remover on tab only.");
+TEST_EQUAL_STR(stripTrailingWhitepace("\r"), "", "Test wspace remover on space only.");
+TEST_EQUAL_STR(stripTrailingWhitepace(" "), "", "Test wspace remover on space only.");
+
+TEST_EQUAL_STR(stripTrailingWhitepace("ø "), "ø", "Test wspace remover on utf-8 char.");
+TEST_EQUAL_STR(stripTrailingWhitepace("ø"), "ø", "Test wspace remover on utf-8 char only.");
+
+TEST_EQUAL_STR(stripTrailingWhitepace("a "), "a", "Test wspace remover on single char only.");
+TEST_EQUAL_STR(stripTrailingWhitepace("a"), "a", "Test wspace remover on single char only.");
+
+TEST_EQUAL_STR(addNewlines
+               ("Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do.", 60),
+                "Lorem ipsum dolor sit amet, consectetur adipisicing elit,\nsed do.",
+               "Test single linesplit.");
+
+TEST_EQUAL_STR(addNewlines
+               ("Lorem ipsum dolor sit amet, consectetur adipisicing elit, øsed do.", 60),
+                "Lorem ipsum dolor sit amet, consectetur adipisicing elit,\nøsed do.",
+               "Test single linesplit around utf-8 char.");
+
+TEST_EQUAL_STR(addNewlines
+               ("Lorem ipsum dolor sit amet, consectetur adipisicing elitø, sed do.", 60),
+                "Lorem ipsum dolor sit amet, consectetur adipisicing elitø,\nsed do.",
+               "Test single linesplit around utf-8 char.");
+
+TEST_EQUAL_STR(addNewlines
+               ("Lorem\nipsum dolor sit amet.", 12),
+                "Lorem\nipsum dolor\nsit amet.",
+               "Test single linesplit with contained newline.");
+
+TEST_EQUAL_STR(addNewlines
+               ("Lorem ipsum dolor sitan met.", 11),
+                "Lorem ipsum\ndolor sitan\nmet.",
+               "Test single linesplit on exact border.");
+
+TEST_EQUAL_STR(addNewlines
+               ("Loremipsum", 6),
+                "Loremi\npsum",
+               "Test single linesplit inside word.");
+
+TEST_EQUAL_STR(addNewlines
+               ("abc Loremipsum", 6),
+                "abc Lo\nremips\num",
+               "Test single linesplit inside word.");
+
+TEST_TRUE(isInsideUTF8("ø", 1), "Test positive utf8 match.");
+TEST_TRUE(isInsideUTF8("aæb", 2), "Test positive utf8 match.");
+TEST_TRUE(isInsideUTF8("aøb", 2), "Test positive utf8 match.");
+TEST_TRUE(isInsideUTF8("aåb", 2), "Test positive utf8 match.");
+TEST_TRUE(isInsideUTF8("aÆb", 2), "Test positive utf8 match.");
+TEST_TRUE(isInsideUTF8("aØb", 2), "Test positive utf8 match.");
+TEST_TRUE(isInsideUTF8("aÅb", 2), "Test positive utf8 match.");
+TEST_FALSE(isInsideUTF8("ø", 0), "Test negative utf8 match.");
+TEST_FALSE(isInsideUTF8("aæøb", 3), "Test negative utf8 match (between two utf8 chars).");
+TEST_FALSE(isInsideUTF8("aøb", 0), "Test negative utf8 match (before utf8 char).");
+
+TEST_FALSE(isInsideUTF8("𤭢", 0), "Test positive utf8 match, len 4.");
+TEST_TRUE(isInsideUTF8("𤭢", 1), "Test positive utf8 match, len 4.");
+TEST_TRUE(isInsideUTF8("𤭢", 2), "Test positive utf8 match, len 4.");
+TEST_TRUE(isInsideUTF8("𤭢", 3), "Test positive utf8 match, len 4.");
+
+TEST_FALSE(isInsideUTF8("€", 0), "Test positive utf8 match, len 3.");
+TEST_TRUE(isInsideUTF8("€", 1), "Test positive utf8 match, len 3.");
+TEST_TRUE(isInsideUTF8("€", 2), "Test positive utf8 match, len 3.");
+
+TEST_FALSE(isInsideUTF8("¢", 0), "Test positive utf8 match, len 2.");
+TEST_TRUE(isInsideUTF8("¢", 1), "Test positive utf8 match, len 2.");
+
+TEST_EQUAL_INT(UTF8Length("ø"), 1, "Test utf8 string length.");
+TEST_EQUAL_INT(UTF8Length("æø"), 2, "Test utf8 string length.");
+TEST_EQUAL_INT(UTF8Length(""), 0, "Test utf8 string length.");
+TEST_EQUAL_INT(UTF8Length("a"), 1, "Test utf8 string length.");
+TEST_EQUAL_INT(UTF8Length("aø"), 2, "Test utf8 string length.");
+TEST_EQUAL_INT(UTF8Length("aøb"), 3, "Test utf8 string length.");
+
+TEST_EQUAL_INT(UTF8Length("a𤭢€¢ø𤭢€¢øa"), 10, "Test utf8 string length, combi.");
+
+TEST_EQUAL_STR(stripTrailingWhitepace(addNewlines("", 60)), "", "Test on empty input.");
 
 TEST_END;
author	deva <deva>	2010-01-18 14:26:28 +0000
committer	deva <deva>	2010-01-18 14:26:28 +0000
commit	2f38783e005e43efd66727371fb0607ca9dfee29 (patch)
tree	2fa12bd2b8a87f9f33c3f52d2dc5bd35b89a1517
parent	26d9661aebf68b656af71e5cee27a7a69943706b (diff)