akregator/src/librss

tools_p.cpp

00001 /*
00002  * tools_p.cpp
00003  *
00004  * Copyright (c) 2001, 2002, 2003 Frerich Raabe <raabe@kde.org>
00005  *
00006  * This program is distributed in the hope that it will be useful, but WITHOUT
00007  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
00008  * FOR A PARTICULAR PURPOSE. For licensing and distribution details, check the
00009  * accompanying file 'COPYING'.
00010  */
00011 #include "tools_p.h"
00012 
00013 #include <krfcdate.h>
00014 #include <qdom.h>
00015 #include <kcharsets.h>
00016 #include <qregexp.h>
00017 
00018 namespace RSS {
00019 
00020 time_t parseISO8601Date(const QString &s)
00021 {
00022     // do some sanity check: 26-12-2004T00:00+00:00 is parsed to epoch+1 in the KRFCDate, which is wrong. So let's check if the date begins with YYYY -fo
00023     if (s.stripWhiteSpace().left(4).toInt() < 1000)
00024         return 0; // error
00025 
00026     // FIXME: imho this is done in KRFCDate::parseDateISO8601() automatically, so we could omit it? -fo
00027     if (s.find('T') != -1)
00028         return KRFCDate::parseDateISO8601(s);
00029     else
00030         return KRFCDate::parseDateISO8601(s + "T12:00:00");
00031 }
00032 
00033 QString childNodesAsXML(const QDomNode& parent)
00034 {
00035     QDomNodeList list = parent.childNodes();
00036     QString str;
00037     QTextStream ts( &str, IO_WriteOnly );
00038     for (uint i = 0; i < list.count(); ++i)
00039         ts << list.item(i);
00040     return str.stripWhiteSpace();
00041 }
00042 
00043 static QString plainTextToHtml(const QString& plainText)
00044 {
00045     QString str(plainText);
00046     str.replace("&", "&amp;");
00047     str.replace("\"", "&quot;");
00048     str.replace("<", "&lt;");
00049     //str.replace(">", "&gt;");
00050     str.replace("\n", "<br/>");
00051     return str;
00052 }
00053 
/**
 * Content categories that mapTypeToFormat() sorts a content element into;
 * extractAtomContent() picks its extraction strategy based on this value.
 */
enum ContentFormat
{
    Text,   // plain text, converted to HTML on extraction
    HTML,   // escaped HTML markup
    XML,    // inline XML (e.g. XHTML), child nodes are serialized
    Binary  // anything else; yields no content
};
00055         
00056 static ContentFormat mapTypeToFormat(const QString& modep, const QString& typep,  const QString& src)
00057 {
00058     QString mode = modep.isNull() ? "escaped" : modep;
00059     QString type = typep;
00060     
00061     //"If neither the type attribute nor the src attribute is provided,
00062     //Atom Processors MUST behave as though the type attribute were
00063     //present with a value of "text""
00064     if (type.isNull() && src.isEmpty())
00065         type = QString::fromUtf8("text");
00066 
00067     if (type == QString::fromUtf8("html")
00068         || type == QString::fromUtf8("text/html"))
00069         return HTML;
00070     
00071     if (type == QString::fromUtf8("text")
00072         || (type.startsWith(QString::fromUtf8("text/"), false)
00073         && !type.startsWith(QString::fromUtf8("text/xml"), false))
00074        )
00075         return Text;
00076     
00077     QStringList xmltypes;
00078     xmltypes.append(QString::fromUtf8("xhtml"));
00079     // XML media types as defined in RFC3023:
00080     xmltypes.append(QString::fromUtf8("text/xml"));
00081     xmltypes.append(QString::fromUtf8("application/xml"));
00082     xmltypes.append(QString::fromUtf8("text/xml-external-parsed-entity"));
00083     xmltypes.append(QString::fromUtf8("application/xml-external-parsed-entity"));
00084     xmltypes.append(QString::fromUtf8("application/xml-dtd"));
00085     
00086     
00087     if (xmltypes.contains(type)
00088         || type.endsWith(QString::fromUtf8("+xml"), false)
00089         || type.endsWith(QString::fromUtf8("/xml"), false))
00090         return XML;
00091     
00092     return Binary;
00093 }
00094 
00095 static QString extractAtomContent(const QDomElement& e)
00096 {
00097     ContentFormat format = mapTypeToFormat(e.attribute("mode"),
00098                                            e.attribute("type"),
00099                                            e.attribute("src"));
00100     
00101     switch (format)
00102     {
00103         case HTML:
00104         {
00105             const bool hasPre = e.text().contains( "<pre>", false ) || e.text().contains( "<pre ", false );
00106             return KCharsets::resolveEntities( hasPre ? e.text() : e.text().simplifyWhiteSpace() );
00107         }
00108         case Text:
00109             return plainTextToHtml(e.text().stripWhiteSpace());
00110         case XML:
00111             return childNodesAsXML(e).simplifyWhiteSpace();
00112         case Binary:
00113         default:
00114             return QString();
00115     }
00116     
00117     return QString();
00118 }
00119 
00120 QString extractNode(const QDomNode &parent, const QString &elemName, bool isInlined)
00121 {
00122     QDomNode node = parent.namedItem(elemName);
00123     if (node.isNull())
00124         return QString::null;
00125 
00126     QDomElement e = node.toElement();
00127         QString result = e.text().stripWhiteSpace(); // let's assume plain text
00128  
00129         if (elemName == "content") // we have Atom here
00130         {
00131             result = extractAtomContent(e);
00132         }        
00133         else // check for HTML; not necessary for Atom:content
00134         {
00135             bool hasPre = result.contains("<pre>", false) || result.contains("<pre ", false);
00136             bool hasHtml = hasPre || result.contains("<");  // FIXME: test if we have html, should be more clever -> regexp
00137             if(!isInlined && !hasHtml)                      // perform nl2br if not a inline elt and it has no html elts
00138                     result = result = result.replace(QChar('\n'), "<br />");
00139             if(!hasPre)                                     // strip white spaces if no <pre>
00140                     result = result.simplifyWhiteSpace();
00141         }
00142         
00143         return result.isEmpty() ? QString::null : result;
00144 }
00145 
00146 QString extractTitle(const QDomNode & parent)
00147 {
00148     QDomNode node = parent.namedItem(QString::fromLatin1("title"));
00149     if (node.isNull())
00150         return QString::null;
00151 
00152     QString result = node.toElement().text();
00153 
00154     result = KCharsets::resolveEntities(KCharsets::resolveEntities(result).replace(QRegExp("<[^>]*>"), "").remove("\\"));
00155     result = result.simplifyWhiteSpace();
00156 
00157     if (result.isEmpty())
00158         return QString::null;
00159 
00160     return result;
00161 }
00162 
00163 static void authorFromString(const QString& strp, QString& name, QString& email)
00164 {
00165     QString str = strp.stripWhiteSpace();
00166     if (str.isEmpty())
00167         return;
00168     
00169     // look for something looking like a mail address ( "foo@bar.com", 
00170     // "<foo@bar.com>") and extract it
00171     
00172     QRegExp remail("<?([^@\\s<]+@[^>\\s]+)>?"); // FIXME: user "proper" regexp,
00173        // search kmail source for it
00174     
00175     int pos = remail.search(str);
00176     if (pos != -1)
00177     {
00178         QString all = remail.cap(0);
00179         email = remail.cap(1);
00180         str.replace(all, ""); // remove mail address
00181     }
00182     
00183     // simplify the rest and use it as name
00184     
00185     name = str.simplifyWhiteSpace();
00186     
00187     // after removing the email, str might have 
00188     // the format "(Foo M. Bar)". We cut off 
00189     // parentheses if there are any. However, if
00190     // str is of the format "Foo M. Bar (President)",
00191     // we should not cut anything.
00192 
00193     QRegExp rename("^\\(([^\\)]*)\\)");
00194     
00195     pos = rename.search(name);
00196     
00197     if (pos != -1)
00198     {
00199         name = rename.cap(1);
00200     }
00201     
00202     name = name.isEmpty() ? QString() : name;
00203     email = email.isEmpty() ? QString() : email;
00204 }
00205 
00206 QString parseItemAuthor(const QDomElement& element, Format format, Version version)
00207 {
00208     QString name;
00209     QString email;
00210 
00211     QDomElement dcCreator = element.namedItem("dc:creator").toElement();
00212     
00213     if (!dcCreator.isNull())
00214          authorFromString(dcCreator.text(), name, email);
00215     else if (format == AtomFeed)
00216     {
00217         QDomElement atomAuthor = element.namedItem("author").toElement();
00218         if (atomAuthor.isNull())
00219             atomAuthor = element.namedItem("atom:author").toElement();
00220         if (!atomAuthor.isNull())
00221         {
00222             QDomElement atomName = atomAuthor.namedItem("name").toElement();
00223             if (atomName.isNull())
00224                 atomName = atomAuthor.namedItem("atom:name").toElement();
00225             name = atomName.text().stripWhiteSpace();
00226             
00227             QDomElement atomEmail = atomAuthor.namedItem("email").toElement();
00228             if (atomEmail.isNull())
00229                 atomEmail = atomAuthor.namedItem("atom:email").toElement();
00230             email = atomEmail.text().stripWhiteSpace();
00231         }
00232     }
00233     else if (format == RSSFeed)
00234     {
00235         authorFromString(element.namedItem("author").toElement().text(), name, email);
00236     }
00237     
00238     if (name.isNull())
00239         name = email;
00240     
00241     if (!email.isNull())
00242         return QString("<a href=\"mailto:%1\">%2</a>").arg(email).arg(name);
00243     else
00244         return name;
00245 }
00246 
00247 } // namespace RSS
00248 
00249 // vim:noet:ts=4
KDE Home | KDE Accessibility Home | Description of Access Keys