summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid Kalnischkies <david@kalnischkies.de>2016-01-02 12:25:29 +0100
committerDavid Kalnischkies <david@kalnischkies.de>2016-01-02 16:19:40 +0100
commit55153bf94ff28a23318e79aa48242244c4d82b3c (patch)
tree5785ef3cffe2d70ed3e0a581a016b1ed1c3f8cdd
parent5f1b8fadbba7108ba20bd07c7479eb5e5704308e (diff)
add optional support for comments in pkgTagFile
APT usually deals with perfectly formatted files generated automatically be other programs – and as it has to parse multiple MBs of such files it tries to be fast rather than forgiving. This was always a problem if we reused this parser for files with a deb822 syntax which are mostly written by hand however, like apt_preferences or the deb822-style sources as these can include stray newlines and more importantly comments all over the place. As a stopgap we had pkgUserTagSection which deals at least with comments before and after a given stanza, but comments in between weren't really supported and now that we support parsing debian/control for e.g. build-dep we face the full comment problem e.g. with comments inbetween multi-line fields (like Build-Depends). We can't easily deal with this on the pkgTagSection level as the interface gives access to 'raw' char-pointers for performance reasons so we would need to optionally add a buffer here on which we could remove comments to hand out pointers into this buffer instead. The interface is quite large already and supports writing stanzas as well, which does not support comments at all either. So while in future it might make sense to have a parser setup which deals with and keeps comments in this commit we opt for the simpler solution for now: We officially declare that pkgTagSection does not support comments and instead expect the caller to deal with them, which in our case is pkgTagFile: pkgTagFile is extended with an additional mode which can deal with comments by dropping them from the buffer which will later form the input of pkgTagSection. The actual implementation is slightly more complex than this sentence suggests at first on one hand to have good performance and on the other to allow jumping directly to stanzas with offsets collected in a previous run (like our cache generation does it for example).
-rw-r--r--apt-pkg/tagfile.cc249
-rw-r--r--apt-pkg/tagfile.h23
-rw-r--r--test/libapt/tagfile_test.cc61
3 files changed, 290 insertions, 43 deletions
diff --git a/apt-pkg/tagfile.cc b/apt-pkg/tagfile.cc
index d5e61baf4..7a263c2b7 100644
--- a/apt-pkg/tagfile.cc
+++ b/apt-pkg/tagfile.cc
@@ -18,6 +18,8 @@
#include <apt-pkg/strutl.h>
#include <apt-pkg/fileutl.h>
+#include <list>
+
#include <string>
#include <stdio.h>
#include <ctype.h>
@@ -29,33 +31,45 @@
using std::string;
-class pkgTagFilePrivate
+class APT_HIDDEN pkgTagFilePrivate /*{{{*/
{
public:
- void Reset(FileFd * const pFd, unsigned long long const pSize)
+ void Reset(FileFd * const pFd, unsigned long long const pSize, pkgTagFile::Flags const pFlags)
{
if (Buffer != NULL)
free(Buffer);
Buffer = NULL;
Fd = pFd;
+ Flags = pFlags;
Start = NULL;
End = NULL;
Done = false;
iOffset = 0;
Size = pSize;
+ isCommentedLine = false;
+ chunks.clear();
}
- pkgTagFilePrivate(FileFd * const pFd, unsigned long long const Size) : Buffer(NULL)
+ pkgTagFilePrivate(FileFd * const pFd, unsigned long long const Size, pkgTagFile::Flags const pFlags) : Buffer(NULL)
{
- Reset(pFd, Size);
+ Reset(pFd, Size, pFlags);
}
FileFd * Fd;
+ pkgTagFile::Flags Flags;
char *Buffer;
char *Start;
char *End;
bool Done;
unsigned long long iOffset;
unsigned long long Size;
+ bool isCommentedLine;
+ struct FileChunk
+ {
+ bool const good;
+ size_t length;
+ FileChunk(bool const pgood, size_t const plength) : good(pgood), length(plength) {}
+ };
+ std::list<FileChunk> chunks;
~pkgTagFilePrivate()
{
@@ -63,8 +77,8 @@ public:
free(Buffer);
}
};
-
-class pkgTagSectionPrivate
+ /*}}}*/
+class APT_HIDDEN pkgTagSectionPrivate /*{{{*/
{
public:
pkgTagSectionPrivate()
@@ -80,6 +94,7 @@ public:
};
std::vector<TagData> Tags;
};
+ /*}}}*/
static unsigned long AlphaHash(const char *Text, size_t Length) /*{{{*/
{
@@ -98,21 +113,23 @@ static unsigned long AlphaHash(const char *Text, size_t Length) /*{{{*/
/*}}}*/
// TagFile::pkgTagFile - Constructor /*{{{*/
-// ---------------------------------------------------------------------
-/* */
+pkgTagFile::pkgTagFile(FileFd * const pFd,pkgTagFile::Flags const pFlags, unsigned long long const Size)
+ : d(new pkgTagFilePrivate(pFd, Size + 4, pFlags))
+{
+ Init(pFd, pFlags, Size);
+}
pkgTagFile::pkgTagFile(FileFd * const pFd,unsigned long long const Size)
- : d(new pkgTagFilePrivate(pFd, Size + 4))
+ : pkgTagFile(pFd, pkgTagFile::STRICT, Size)
{
- Init(pFd, Size);
}
-void pkgTagFile::Init(FileFd * const pFd,unsigned long long Size)
+void pkgTagFile::Init(FileFd * const pFd, pkgTagFile::Flags const pFlags, unsigned long long Size)
{
/* The size is increased by 4 because if we start with the Size of the
filename we need to try to read 1 char more to see an EOF faster, 1
char the end-pointer can be on and maybe 2 newlines need to be added
to the end of the file -> 4 extra chars */
Size += 4;
- d->Reset(pFd, Size);
+ d->Reset(pFd, Size, pFlags);
if (d->Fd->IsOpen() == false)
d->Start = d->End = d->Buffer = 0;
@@ -129,10 +146,12 @@ void pkgTagFile::Init(FileFd * const pFd,unsigned long long Size)
if (d->Done == false)
Fill();
}
+void pkgTagFile::Init(FileFd * const pFd,unsigned long long Size)
+{
+ Init(pFd, pkgTagFile::STRICT, Size);
+}
/*}}}*/
// TagFile::~pkgTagFile - Destructor /*{{{*/
-// ---------------------------------------------------------------------
-/* */
pkgTagFile::~pkgTagFile()
{
delete d;
@@ -162,7 +181,7 @@ bool pkgTagFile::Resize(unsigned long long const newSize)
unsigned long long const EndSize = d->End - d->Start;
// get new buffer and use it
- char* newBuffer = (char*)realloc(d->Buffer, sizeof(char) * newSize);
+ char* const newBuffer = static_cast<char*>(realloc(d->Buffer, sizeof(char) * newSize));
if (newBuffer == NULL)
return false;
d->Buffer = newBuffer;
@@ -199,8 +218,35 @@ bool pkgTagFile::Step(pkgTagSection &Tag)
} while (Tag.Scan(d->Start,d->End - d->Start, false) == false);
}
- d->Start += Tag.size();
- d->iOffset += Tag.size();
+ size_t tagSize = Tag.size();
+ d->Start += tagSize;
+
+ if ((d->Flags & pkgTagFile::SUPPORT_COMMENTS) == 0)
+ d->iOffset += tagSize;
+ else
+ {
+ auto first = d->chunks.begin();
+ for (; first != d->chunks.end(); ++first)
+ {
+ if (first->good == false)
+ d->iOffset += first->length;
+ else
+ {
+ if (tagSize < first->length)
+ {
+ first->length -= tagSize;
+ d->iOffset += tagSize;
+ break;
+ }
+ else
+ {
+ tagSize -= first->length;
+ d->iOffset += first->length;
+ }
+ }
+ }
+ d->chunks.erase(d->chunks.begin(), first);
+ }
Tag.Trim();
return true;
@@ -210,49 +256,166 @@ bool pkgTagFile::Step(pkgTagSection &Tag)
// ---------------------------------------------------------------------
/* This takes the bit at the end of the buffer and puts it at the start
then fills the rest from the file */
+static bool FillBuffer(pkgTagFilePrivate * const d)
+{
+ unsigned long long Actual = 0;
+ // See if only a bit of the file is left
+ unsigned long long const dataSize = d->Size - ((d->End - d->Buffer) + 1);
+ if (d->Fd->Read(d->End, dataSize, &Actual) == false)
+ return false;
+ if (Actual != dataSize)
+ d->Done = true;
+ d->End += Actual;
+ return true;
+}
+static void RemoveCommentsFromBuffer(pkgTagFilePrivate * const d)
+{
+ // look for valid comments in the buffer
+ char * good_start = nullptr, * bad_start = nullptr;
+ char * current = d->Start;
+ if (d->isCommentedLine == false)
+ {
+ if (d->Start == d->Buffer)
+ {
+ // the start of the buffer is a newline as a record can't start
+ // in the middle of a line by definition.
+ if (*d->Start == '#')
+ {
+ d->isCommentedLine = true;
+ ++current;
+ if (current > d->End)
+ d->chunks.emplace_back(false, 1);
+ }
+ }
+ if (d->isCommentedLine == false)
+ good_start = d->Start;
+ else
+ bad_start = d->Start;
+ }
+ else
+ bad_start = d->Start;
+
+ std::vector<std::pair<char*, size_t>> good_parts;
+ while (current <= d->End)
+ {
+ size_t const restLength = (d->End - current) + 1;
+ if (d->isCommentedLine == false)
+ {
+ current = static_cast<char*>(memchr(current, '#', restLength));
+ if (current == nullptr)
+ {
+ size_t const goodLength = d->End - good_start;
+ d->chunks.emplace_back(true, goodLength);
+ if (good_start != d->Start)
+ good_parts.push_back(std::make_pair(good_start, goodLength));
+ break;
+ }
+ bad_start = current;
+ --current;
+ // ensure that this is really a comment and not a '#' in the middle of a line
+ if (*current == '\n')
+ {
+ size_t const goodLength = (current - good_start) + 1;
+ d->chunks.emplace_back(true, goodLength);
+ good_parts.push_back(std::make_pair(good_start, goodLength));
+ good_start = nullptr;
+ d->isCommentedLine = true;
+ }
+ current += 2;
+ }
+ else // the current line is a comment
+ {
+ current = static_cast<char*>(memchr(current, '\n', restLength));
+ if (current == nullptr)
+ {
+ d->chunks.emplace_back(false, (d->End - bad_start));
+ break;
+ }
+ ++current;
+ // is the next line a comment, too?
+ if (current > d->End || *current != '#')
+ {
+ d->chunks.emplace_back(false, (current - bad_start));
+ good_start = current;
+ bad_start = nullptr;
+ d->isCommentedLine = false;
+ }
+ ++current;
+ }
+ }
+
+ if (good_parts.empty() == false)
+ {
+ // we found comments, so move later parts over them
+ current = d->Start;
+ for (auto const &good: good_parts)
+ {
+ memmove(current, good.first, good.second);
+ current += good.second;
+ }
+ d->End = current;
+ }
+
+ if (d->isCommentedLine == true)
+ {
+ // deal with a buffer containing only comments
+ // or an (unfinished) comment at the end
+ if (good_parts.empty() == true)
+ d->End = d->Start;
+ else
+ d->Start = d->End;
+ }
+ else
+ {
+ // the buffer was all comment, but ended with the buffer
+ if (good_parts.empty() == true && good_start >= d->End)
+ d->End = d->Start;
+ else
+ d->Start = d->End;
+ }
+}
bool pkgTagFile::Fill()
{
- unsigned long long EndSize = d->End - d->Start;
+ unsigned long long const EndSize = d->End - d->Start;
+ if (EndSize != 0)
+ {
+ memmove(d->Buffer,d->Start,EndSize);
+ d->Start = d->End = d->Buffer + EndSize;
+ }
+ else
+ d->Start = d->End = d->Buffer;
+
unsigned long long Actual = 0;
-
- memmove(d->Buffer,d->Start,EndSize);
- d->Start = d->Buffer;
- d->End = d->Buffer + EndSize;
-
- if (d->Done == false)
+ while (d->Done == false && d->Size > (Actual + 1))
{
- // See if only a bit of the file is left
- unsigned long long const dataSize = d->Size - ((d->End - d->Buffer) + 1);
- if (d->Fd->Read(d->End, dataSize, &Actual) == false)
+ if (FillBuffer(d) == false)
return false;
- if (Actual != dataSize)
- d->Done = true;
- d->End += Actual;
+ if ((d->Flags & pkgTagFile::SUPPORT_COMMENTS) != 0)
+ RemoveCommentsFromBuffer(d);
+ Actual = d->End - d->Buffer;
}
-
+ d->Start = d->Buffer;
+
if (d->Done == true)
{
if (EndSize <= 3 && Actual == 0)
return false;
if (d->Size - (d->End - d->Buffer) < 4)
return true;
-
+
// Append a double new line if one does not exist
unsigned int LineCount = 0;
for (const char *E = d->End - 1; E - d->End < 6 && (*E == '\n' || *E == '\r'); E--)
if (*E == '\n')
- LineCount++;
+ ++LineCount;
if (LineCount < 2)
{
- if ((unsigned)(d->End - d->Buffer) >= d->Size)
+ if (static_cast<unsigned long long>(d->End - d->Buffer) >= d->Size)
Resize(d->Size + 3);
- for (; LineCount < 2; LineCount++)
+ for (; LineCount < 2; ++LineCount)
*d->End++ = '\n';
}
-
- return true;
}
-
return true;
}
/*}}}*/
@@ -262,8 +425,9 @@ bool pkgTagFile::Fill()
that is there */
bool pkgTagFile::Jump(pkgTagSection &Tag,unsigned long long Offset)
{
+ if ((d->Flags & pkgTagFile::SUPPORT_COMMENTS) == 0 &&
// We are within a buffer space of the next hit..
- if (Offset >= d->iOffset && d->iOffset + (d->End - d->Start) > Offset)
+ Offset >= d->iOffset && d->iOffset + (d->End - d->Start) > Offset)
{
unsigned long long Dist = Offset - d->iOffset;
d->Start += Dist;
@@ -281,7 +445,9 @@ bool pkgTagFile::Jump(pkgTagSection &Tag,unsigned long long Offset)
if (d->Fd->Seek(Offset) == false)
return false;
d->End = d->Start = d->Buffer;
-
+ d->isCommentedLine = false;
+ d->chunks.clear();
+
if (Fill() == false)
return false;
@@ -644,11 +810,12 @@ bool pkgTagSection::FindFlag(unsigned long &Flags, unsigned long Flag,
return true;
}
/*}}}*/
-void pkgTagSection::Get(const char *&Start,const char *&Stop,unsigned int I) const
+void pkgTagSection::Get(const char *&Start,const char *&Stop,unsigned int I) const/*{{{*/
{
Start = Section + d->Tags[I].StartTag;
Stop = Section + d->Tags[I+1].StartTag;
}
+ /*}}}*/
APT_PURE unsigned int pkgTagSection::Count() const { /*{{{*/
if (d->Tags.empty() == true)
return 0;
diff --git a/apt-pkg/tagfile.h b/apt-pkg/tagfile.h
index 922b56f8e..2f26846ef 100644
--- a/apt-pkg/tagfile.h
+++ b/apt-pkg/tagfile.h
@@ -34,7 +34,15 @@
class FileFd;
class pkgTagSectionPrivate;
+class pkgTagFilePrivate;
+/** \class pkgTagSection parses a single deb822 stanza and provides various Find methods
+ * to extract the included values. It can also be used to modify and write a
+ * valid deb822 stanza optionally (re)ordering the fields inside the stanza.
+ *
+ * Beware: This class does \b NOT support (#-)comments in in- or output!
+ * If the input contains comments they have to be stripped first like pkgTagFile
+ * does with SUPPORT_COMMENTS flag set. */
class pkgTagSection
{
const char *Section;
@@ -139,7 +147,10 @@ class pkgUserTagSection : public pkgTagSection
virtual void TrimRecord(bool BeforeRecord, const char* &End) APT_OVERRIDE;
};
-class pkgTagFilePrivate;
+/** \class pkgTagFile reads and prepares a deb822 formatted file for parsing
+ * via #pkgTagSection. The default mode tries to be as fast as possible and
+ * assumes perfectly valid (machine generated) files like Packages. Support
+ * for comments e.g. needs to be enabled explicitly. */
class pkgTagFile
{
pkgTagFilePrivate * const d;
@@ -148,14 +159,22 @@ class pkgTagFile
APT_HIDDEN bool Resize();
APT_HIDDEN bool Resize(unsigned long long const newSize);
- public:
+public:
bool Step(pkgTagSection &Section);
unsigned long Offset();
bool Jump(pkgTagSection &Tag,unsigned long long Offset);
+ enum Flags
+ {
+ STRICT = 0,
+ SUPPORT_COMMENTS = 1 << 0,
+ };
+
+ void Init(FileFd * const F, pkgTagFile::Flags const Flags, unsigned long long Size = 32*1024);
void Init(FileFd * const F,unsigned long long const Size = 32*1024);
+ pkgTagFile(FileFd * const F, pkgTagFile::Flags const Flags, unsigned long long Size = 32*1024);
pkgTagFile(FileFd * const F,unsigned long long Size = 32*1024);
virtual ~pkgTagFile();
};
diff --git a/test/libapt/tagfile_test.cc b/test/libapt/tagfile_test.cc
index 064e91b79..24fbd389e 100644
--- a/test/libapt/tagfile_test.cc
+++ b/test/libapt/tagfile_test.cc
@@ -40,6 +40,10 @@ TEST(TagFileTest,SingleField)
EXPECT_EQ(0, section.Count());
EXPECT_FALSE(section.Exists("FieldA-12345678"));
EXPECT_FALSE(section.Exists("FieldB-12345678"));
+
+ createTemporaryFile("emptyfile", fd, NULL, NULL);
+ ASSERT_FALSE(tfile.Step(section));
+ EXPECT_EQ(0, section.Count());
}
TEST(TagFileTest,MultipleSections)
@@ -222,3 +226,60 @@ TEST(TagFileTest, SpacesEverywhere)
// overridden values are still present, but not really accessible
EXPECT_EQ(12, section.Count());
}
+
+TEST(TagFileTest, Comments)
+{
+ FileFd fd;
+ createTemporaryFile("commentfile", fd, NULL, "# Leading comments should be ignored.\n"
+"\n"
+"Source: foo\n"
+"#Package: foo\n"
+"Section: bar\n"
+"#Section: overriden\n"
+"Priority: optional\n"
+"Build-Depends: debhelper,\n"
+"# apt-utils, (temporarily disabled)\n"
+" apt\n"
+"\n"
+"# Comments in the middle shouldn't result in extra blank paragraphs either.\n"
+"\n"
+"# Ditto.\n"
+"\n"
+"# A comment at the top of a paragraph should be ignored.\n"
+"Package: foo\n"
+"Architecture: any\n"
+"Description: An awesome package\n"
+" # This should still appear in the result.\n"
+"# this one shouldn't\n"
+" Blah, blah, blah. # but this again.\n"
+"# A comment at the end of a paragraph should be ignored.\n"
+"\n"
+"# Trailing comments shouldn't cause extra blank paragraphs."
+ );
+
+ pkgTagFile tfile(&fd, pkgTagFile::SUPPORT_COMMENTS, 1);
+ pkgTagSection section;
+ EXPECT_TRUE(tfile.Step(section));
+ EXPECT_FALSE(section.Exists("Package"));
+ EXPECT_TRUE(section.Exists("Source"));
+ EXPECT_EQ("foo", section.FindS("Source"));
+ EXPECT_TRUE(section.Exists("Section"));
+ EXPECT_EQ("bar", section.FindS("Section"));
+ EXPECT_TRUE(section.Exists("Priority"));
+ EXPECT_EQ("optional", section.FindS("Priority"));
+ EXPECT_TRUE(section.Exists("Build-Depends"));
+ EXPECT_EQ("debhelper,\n apt", section.FindS("Build-Depends"));
+
+ EXPECT_TRUE(tfile.Step(section));
+ EXPECT_FALSE(section.Exists("Source"));
+ EXPECT_TRUE(section.Exists("Package"));
+ EXPECT_EQ("foo", section.FindS("Package"));
+ EXPECT_FALSE(section.Exists("Section"));
+ EXPECT_TRUE(section.Exists("Architecture"));
+ EXPECT_EQ("any", section.FindS("Architecture"));
+ EXPECT_FALSE(section.Exists("Build-Depends"));
+ EXPECT_TRUE(section.Exists("Description"));
+ EXPECT_EQ("An awesome package\n # This should still appear in the result.\n Blah, blah, blah. # but this again.", section.FindS("Description"));
+
+ EXPECT_FALSE(tfile.Step(section));
+}