rewrite ReadMessages()

Central methods of our infrastructure like this one responsible for communication with our methods shouldn't be more complicated then they have to and not claim to have (albeit unlikely) bugs. While I am not sure about having improved the first part, the bug is now gone and a few explicit tests check that it stays that way, so nobody will notice the difference (hopefully) – expect that this should a very tiny bit faster as well as we don't manually proceed through the string. Git-Dch: Ignore
author: David Kalnischkies <david@kalnischkies.de> 2014-10-24 23:55:15 +0200
committer: David Kalnischkies <david@kalnischkies.de> 2014-10-26 14:13:34 +0100
commit: d8c71b3b5dc98daa247433503ad8242c9e7b77db (patch)
tree: 64d520c6b863554d0c593bedbcc010ce5c0a5a0b
parent: 23397c9d7d4d455461176600bb45c81185493504 (diff)
2 files changed, 136 insertions, 58 deletions
diff --git a/apt-pkg/contrib/strutl.cc b/apt-pkg/contrib/strutl.cc
index aad358a55..ebf9c9ea6 100644
--- a/apt-pkg/contrib/strutl.cc
+++ b/apt-pkg/contrib/strutl.cc
@@ -779,86 +779,94 @@ string TimeRFC1123(time_t Date)
 
    In particular: this reads blocks from the input until it believes
    that it's run out of input text.  Each block is terminated by a
-   double newline ('\n' followed by '\n').  As noted below, there is a
-   bug in this code: it assumes that all the blocks have been read if
-   it doesn't see additional text in the buffer after the last one is
-   parsed, which will cause it to lose blocks if the last block
-   coincides with the end of the buffer.
+   double newline ('\n' followed by '\n').
  */
 bool ReadMessages(int Fd, vector<string> &List)
 {
    char Buffer[64000];
-   char *End = Buffer;
    // Represents any left-over from the previous iteration of the
    // parse loop.  (i.e., if a message is split across the end
    // of the buffer, it goes here)
    string PartialMessage;
-   
-   while (1)
-   {
-      int Res = read(Fd,End,sizeof(Buffer) - (End-Buffer));
+
+   do {
+      int const Res = read(Fd, Buffer, sizeof(Buffer));
       if (Res < 0 && errno == EINTR)
 	 continue;
-      
-      // Process is dead, this is kind of bad..
+
+      // process we read from has died
       if (Res == 0)
 	 return false;
-      
+
       // No data
-      if (Res < 0 && errno == EAGAIN)
+      if (Res < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
 	 return true;
       if (Res < 0)
 	 return false;
-			      
-      End += Res;
-      
-      // Look for the end of the message
-      for (char *I = Buffer; I + 1 < End; I++)
+
+      // extract the message(s) from the buffer
+      char const *Start = Buffer;
+      char const * const End = Buffer + Res;
+
+      char const * NL = (char const *) memchr(Start, '\n', End - Start);
+      if (NL == NULL)
       {
-	 if (I[1] != '\n' ||
-	       (I[0] != '\n' && strncmp(I, "\r\n\r\n", 4) != 0))
-	    continue;
-	 
-	 // Pull the message out
-	 string Message(Buffer,I-Buffer);
-	 PartialMessage += Message;
-
-	 // Fix up the buffer
-	 for (; I < End && (*I == '\n' || *I == '\r'); ++I);
-	 End -= I-Buffer;	 
-	 memmove(Buffer,I,End-Buffer);
-	 I = Buffer;
-	 
-	 List.push_back(PartialMessage);
-	 PartialMessage.clear();
+	 // end of buffer: store what we have so far and read new data in
+	 PartialMessage.append(Start, End - Start);
+	 Start = End;
       }
-      if (End != Buffer)
-	{
-	  // If there's text left in the buffer, store it
-	  // in PartialMessage and throw the rest of the buffer
-	  // away.  This allows us to handle messages that
-	  // are longer than the static buffer size.
-	  PartialMessage += string(Buffer, End);
-	  End = Buffer;
-	}
       else
-	{
-	  // BUG ALERT: if a message block happens to end at a
-	  // multiple of 64000 characters, this will cause it to
-	  // terminate early, leading to a badly formed block and
-	  // probably crashing the method.  However, this is the only
-	  // way we have to find the end of the message block.  I have
-	  // an idea of how to fix this, but it will require changes
-	  // to the protocol (essentially to mark the beginning and
-	  // end of the block).
-	  //
-	  //  -- dburrows 2008-04-02
-	  return true;
-	}
+	 ++NL;
+
+      if (PartialMessage.empty() == false && Start < End)
+      {
+	 // if we start with a new line, see if the partial message we have ended with one
+	 // so that we properly detect records ending between two read() runs
+	 // cases are: \n|\n  ,  \r\n|\r\n  and  \r\n\r|\n
+	 // the case \r|\n\r\n is handled by the usual double-newline handling
+	 if ((NL - Start) == 1 || ((NL - Start) == 2 && *Start == '\r'))
+	 {
+	    if (APT::String::Endswith(PartialMessage, "\n") || APT::String::Endswith(PartialMessage, "\r\n\r"))
+	    {
+	       PartialMessage.erase(PartialMessage.find_last_not_of("\r\n") + 1);
+	       List.push_back(PartialMessage);
+	       PartialMessage.clear();
+	       while (NL < End && (*NL == '\n' || *NL == '\r')) ++NL;
+	       Start = NL;
+	    }
+	 }
+      }
+
+      while (Start < End) {
+	 char const * NL2 = (char const *) memchr(NL, '\n', End - NL);
+	 if (NL2 == NULL)
+	 {
+	    // end of buffer: store what we have so far and read new data in
+	    PartialMessage.append(Start, End - Start);
+	    break;
+	 }
+	 ++NL2;
+
+	 // did we find a double newline?
+	 if ((NL2 - NL) == 1 || ((NL2 - NL) == 2 && *NL == '\r'))
+	 {
+	    PartialMessage.append(Start, NL2 - Start);
+	    PartialMessage.erase(PartialMessage.find_last_not_of("\r\n") + 1);
+	    List.push_back(PartialMessage);
+	    PartialMessage.clear();
+	    while (NL2 < End && (*NL2 == '\n' || *NL2 == '\r')) ++NL2;
+	    Start = NL2;
+	 }
+	 NL = NL2;
+      }
+
+      // we have read at least one complete message and nothing left
+      if (PartialMessage.empty() == true)
+	 return true;
 
       if (WaitFd(Fd) == false)
 	 return false;
-   }   
+   } while (true);
 }
 									/*}}}*/
 // MonthConv - Converts a month string into a number			/*{{{*/
diff --git a/test/libapt/strutil_test.cc b/test/libapt/strutil_test.cc
index 494159c87..9bc3c76fd 100644
--- a/test/libapt/strutil_test.cc
+++ b/test/libapt/strutil_test.cc
@@ -1,10 +1,13 @@
 #include <config.h>
 #include <apt-pkg/strutl.h>
+#include <apt-pkg/fileutl.h>
 #include <string>
 #include <vector>
 
 #include <gtest/gtest.h>
 
+#include "file-helpers.h"
+
 TEST(StrUtilTest,DeEscapeString)
 {
    // nothing special
@@ -143,3 +146,70 @@ TEST(StrUtilTest,Base64Encode)
    EXPECT_EQ("Lg==", Base64Encode("."));
    EXPECT_EQ("", Base64Encode(""));
 }
+void ReadMessagesTestWithNewLine(char const * const nl, char const * const ab)
+{
+   SCOPED_TRACE(SubstVar(SubstVar(nl, "\n", "n"), "\r", "r") + " # " + ab);
+   FileFd fd;
+   std::string pkgA = "Package: pkgA\n"
+      "Version: 1\n"
+      "Size: 100\n"
+      "Description: aaa\n"
+      " aaa";
+   std::string pkgB = "Package: pkgB\n"
+      "Version: 1\n"
+      "Flag: no\n"
+      "Description: bbb";
+   std::string pkgC = "Package: pkgC\n"
+      "Version: 2\n"
+      "Flag: yes\n"
+      "Description:\n"
+      " ccc";
+
+   createTemporaryFile("readmessage", fd, NULL, (pkgA + nl + pkgB + nl + pkgC + nl).c_str());
+   std::vector<std::string> list;
+   EXPECT_TRUE(ReadMessages(fd.Fd(), list));
+   EXPECT_EQ(3, list.size());
+   EXPECT_EQ(pkgA, list[0]);
+   EXPECT_EQ(pkgB, list[1]);
+   EXPECT_EQ(pkgC, list[2]);
+
+   size_t const msgsize = 63990;
+   createTemporaryFile("readmessage", fd, NULL, NULL);
+   for (size_t j = 0; j < msgsize; ++j)
+      fd.Write(ab, strlen(ab));
+   for (size_t i = 0; i < 21; ++i)
+   {
+      std::string msg;
+      strprintf(msg, "msgsize=%zu  i=%zu", msgsize, i);
+      SCOPED_TRACE(msg);
+      fd.Seek((msgsize + (i - 1)) * strlen(ab));
+      fd.Write(ab, strlen(ab));
+      fd.Write(nl, strlen(nl));
+      fd.Seek(0);
+      list.clear();
+      EXPECT_TRUE(ReadMessages(fd.Fd(), list));
+      EXPECT_EQ(1, list.size());
+      EXPECT_EQ((msgsize + i) * strlen(ab), list[0].length());
+      EXPECT_EQ(std::string::npos, list[0].find_first_not_of(ab));
+   }
+
+   list.clear();
+   fd.Write(pkgA.c_str(), pkgA.length());
+   fd.Write(nl, strlen(nl));
+   fd.Seek(0);
+   EXPECT_TRUE(ReadMessages(fd.Fd(), list));
+   EXPECT_EQ(2, list.size());
+   EXPECT_EQ((msgsize + 20) * strlen(ab), list[0].length());
+   EXPECT_EQ(std::string::npos, list[0].find_first_not_of(ab));
+   EXPECT_EQ(pkgA, list[1]);
+
+
+   fd.Close();
+}
+TEST(StrUtilTest,ReadMessages)
+{
+   ReadMessagesTestWithNewLine("\n\n", "a");
+   ReadMessagesTestWithNewLine("\r\n\r\n", "a");
+   ReadMessagesTestWithNewLine("\n\n", "ab");
+   ReadMessagesTestWithNewLine("\r\n\r\n", "ab");
+}
author	David Kalnischkies <david@kalnischkies.de>	2014-10-24 23:55:15 +0200
committer	David Kalnischkies <david@kalnischkies.de>	2014-10-26 14:13:34 +0100
commit	d8c71b3b5dc98daa247433503ad8242c9e7b77db (patch)
tree	64d520c6b863554d0c593bedbcc010ce5c0a5a0b
parent	23397c9d7d4d455461176600bb45c81185493504 (diff)