Logo Search packages:      
Sourcecode: kmess version File versions  Download package

void RichTextParser::parseMsnString ( QString &  text,
bool  showEmoticons = true,
bool  showSmallEmoticons = true,
bool  showLinks = true,
bool  showFormatting = false,
bool  allowEmoticonLinks = true,
const QString &  handle = *((QString*)0),
QStringList &  pendingEmoticonTags = *((QStringList*)0) 
) [static]

Transform a string into its rich text form

This is the one-stop shop for text parsing. This method can convert many things into their rich text equivalents in a single pass:

  • web links are made clickable, even geek-style "kmess.org" links and email addresses
  • emoticon shortcuts are transformed into HTML image tags, including custom ones and ones which have not been received yet
  • MSN Plus formatting is turned into HTML formatting

Parameters:
text The string to parse
showEmoticons Whether to show MSN emoticons in the parsed string
showSmallEmoticons Whether the emoticons should be full-size or small
showLinks Whether to enable clickable links and email addresses
showFormatting Whether to show or strip away MSN+ formatting tags
allowEmoticonLinks If false, the parser will never add links for adding an emoticon (such as for the chat history dialog)
handle If not null or empty, custom emoticons of this contact will be parsed
pendingEmoticonTags If the handle is specified, this must be too: for every pending custom emoticon (one sent by the contact specified by handle) found in the text, the parser inserts a placeholder image and appends the placeholder's id to this list.

Definition at line 501 of file richtextparser.cpp.

References EmoticonManager::emoticonIsAdded(), CurrentAccount::getContactByHandle(), ContactBase::getEmoticonBlackList(), ContactBase::getEmoticonHashes(), ContactBase::getEmoticonPattern(), ContactBase::getEmoticonReplacements(), EmoticonManager::getHtmlPattern(), EmoticonManager::getHtmlReplacements(), ContactBase::getPendingEmoticonPattern(), KMessShared::htmlEscape(), and parseMsnPlusString().

Referenced by KMessViewDelegate::paint().

{
  // Overview of the steps below:
  //  1. HTML-escape the input, so that every '<' found afterwards was produced by this parser.
  //  2. Collect the emoticon patterns/replacements (standard ones, plus the custom ones of 'handle').
  //  3. Build the link, email, "geek link" and long-word regexps.
  //  4. Apply the regexps in a fixed order over a list of text fragments: each match splits
  //     its fragment into "text before" / "HTML replacement" / "text after".
  //  5. Join the fragments, protect significant spaces, and process MSN Plus formatting.

  // Remove all HTML
  KMessShared::htmlEscape( text );

  bool                         allowAddingEmoticons = false;

  // Build a collection of all emoticon data
  const QRegExp                &emoticonRegExp       = emoticonManager_->getHtmlPattern();
  const QHash<QString,QString> &emoticonReplacements = emoticonManager_->getHtmlReplacements( showSmallEmoticons );


  QRegExp customRegExp;
  QRegExp pendingRegExp;
  QHash<QString,QString> customReplacements;
  QHash<QString,QString> customHashes;
  QStringList customEmoticonsBlacklist;

  // Destination for the placeholder ids of pending emoticons. The default argument binds
  // 'pendingEmoticonTags' to a null reference, which must never be written through: in that
  // case the ids are collected in this local list and discarded when the method returns.
  QStringList  discardedPendingTags;
  QStringList *pendingTags = &discardedPendingTags;

  // Get theme of custom emoticons
  if( &handle != 0 && ! handle.isEmpty() )
  {
    // Avoid problems if no list of pending emoticons has been given
    if( &pendingEmoticonTags == 0 )
    {
      kWarning() << "The given pending emoticons list is not valid!";
    }
    else
    {
      pendingTags = &pendingEmoticonTags;
    }

    if( handle == CurrentAccount::instance()->getHandle() )
    {
      customRegExp       = emoticonManager_->getHtmlPattern( true );
      customReplacements = emoticonManager_->getHtmlReplacements( showSmallEmoticons, true );
      // We already have all of our emoticons, there are no pending ones
    }
    else
    {
      const ContactBase *contact = CurrentAccount::instance()->getContactByHandle( handle );
      if( contact != 0 )
      {
        customRegExp       = contact->getEmoticonPattern();
        customReplacements = contact->getEmoticonReplacements();
        customHashes       = contact->getEmoticonHashes();
        pendingRegExp      = contact->getPendingEmoticonPattern();
        customEmoticonsBlacklist = contact->getEmoticonBlackList();

        // "Add this emoticon" links are only offered for emoticons of other contacts
        allowAddingEmoticons = allowEmoticonLinks;
      }
    }
  }


  // TODO: place these regexps at the beginning of this file and
  // initialize them *once*!
  QRegExp linkRegExp;
  linkRegExp.setPattern( "\\b((?:http://|https://|ftp://|sftp://|www\\.)"
                         "\\S+)" 
                              // match protocol string followed by the host/path
                         "[.,;!?]?(?:&lt;|\\s|$)"
                              // ending with <, \s or $, not counting .,;?!"' before
                              // (there are some more modifications to a matched
                              // URL below)
                       );
  linkRegExp.setMinimal(1);

  QRegExp emailRegExp;
  emailRegExp.setPattern(
                          "\\b("                   // begin of word, start capture
                          "[a-zA-Z0-9_\\-\\.+]+"       // match email username
                          "\\@"                    // match '@'
                          "[a-zA-Z0-9\\-\\.]+"        // match domain hostname
                          "\\.[a-zA-Z0-9]{2,6}"       // match top-level-domain
                          ")"                      // end capture
                          "(?:[^a-zA-Z0-9\\-]|$)"  // not followed by more simple characters, or should find an end-of-line
                        );

  QRegExp geekLinkRegExp;
  geekLinkRegExp.setPattern(
                             "(?:^|\\b)"                // look-before test, for start of capture or word delimiter
                             "("                        // begin of word, start capture
                             "([a-zA-Z0-9\\-]+\\.)+"        // match simple characters, but it should contain a dot between each part
                             "([a-zA-Z]{2,3})"             // finally match domain part 2 or 3 characters
                             "(/[a-zA-Z0-9\\-_/\\.?=&]+)?" // match the path on the server and simple query requests
                             ")"                        // end capture
                             "(?:[^a-zA-Z0-9]|$)"       // not followed by more simple characters, or should find an end-of-line
                           );

  // these can also be initialized *once*!
  // Capture 1 is the first WORDWRAP_EVERY characters of a long word, capture 2 is the rest of it
  QRegExp longWordsRegExp( QString( "([\\w\\d-_\\.]{%1})([\\w\\d-_\\.]+)" ).arg( WORDWRAP_EVERY ) );
  QStringList invalidCcTldList, topLevelDomainList;
  invalidCcTldList << "js" << "hh" << "cc" << "ui" << "fo" << "so"
   << "ko" << "qt" << "pp" << "cf" << "am" << "in" << "gz" << "ps"
   << "ai" << "rv" << "rm" << "wm" << "xd";
    // block typical files instead of listing the whole country code list
  topLevelDomainList << "com" << "org" << "net" << "edu" << "gov";

  // removing the need to test these every time:
#ifdef KMESSTEST
  KMESS_ASSERT( emoticonRegExp.isValid() );
  KMESS_ASSERT( emailRegExp.isValid() );
  KMESS_ASSERT( linkRegExp.isValid() );
  KMESS_ASSERT( geekLinkRegExp.isValid() );
  KMESS_ASSERT( longWordsRegExp.isValid() );
#endif

  // Set the filename of the placeholder image for pending emoticons
  static QString pendingEmoticonPlaceholder(
      Qt::escape( KGlobal::dirs()->findResource( "appdata", "pics/empty.png" ) ) );
  
  // Set up the emoticon replacement list
  QHash<QString,QString> emoticonReplacementList;
  QHash<QString,QString>::const_iterator ei;
  // first write emoticonReplacements, then overwrite with customReplacements.
  // We can't do pendingEmoticons right now, because we don't have a QHash of them.
  // This is considered TODO.
  ei = emoticonReplacements.constBegin();
  while( ei != emoticonReplacements.constEnd())
  {
    KMESS_ASSERT( ! ei.value().isEmpty() );
    emoticonReplacementList.insert( ei.key(), "<span>" + ei.value() + "</span>" );
    ++ei;
  }
  ei = customReplacements.constBegin();
  while( ei != customReplacements.constEnd())
  {
    KMESS_ASSERT( ! ei.value().isEmpty() );

    // Blacklisted shortcuts are not inserted, so a standard emoticon with the same
    // shortcut (if any) keeps its replacement
    if( customEmoticonsBlacklist.contains( ei.key() ) )
    {
#ifdef KMESSDEBUG_RICHTEXTPARSER
      kDebug() << "Ignoring blacklisted emoticon" << ei.key();
#endif
      ++ei;
      continue;
    }

    if( allowAddingEmoticons && ! emoticonManager_->emoticonIsAdded( customHashes.value( ei.key() ) ) )
    {
#ifdef KMESSDEBUG_RICHTEXTPARSER
      kDebug() << "Inserting emoticon additional link for " << ei.key() << " with hash " << customHashes.value( ei.key() );
#endif
      QString imagePath( ei.value() );
      QString urlCode( QUrl::toPercentEncoding( ei.key() ) );

      // Retrieve the image name from the replacement
      // TODO: Change the ' to \", and optimize/cache the result (somehow!)
      imagePath = imagePath.replace( "\"", "'");
      imagePath = imagePath.mid(  imagePath.indexOf( "src='" ) + 5 );
      imagePath = imagePath.left( imagePath.indexOf( "'" ) );

      // i18n() will unescape the string: Without this, an emoticon like " 'test " will
      // result in an attribute like this: " title='Add this emoticon: 'test' ", messing
      // up the whole markup
      QString escapedCode( ei.key() );
      KMessShared::htmlEscape( escapedCode );

      // Wrap the emoticon image in a link which carries the contact handle, the shortcut and the image path
      emoticonReplacementList.insert( ei.key(),
            "<a name='newEmoticon_" + urlCode + "' title='" +
            i18n( "Add this emoticon: %1", escapedCode )
            + "' href='kmess://emoticon/" + handle + "/" + urlCode + "/"
            + QUrl::toPercentEncoding( imagePath ) + "'>"
            + ei.value() + "</a>" );
    }
    else
    {
#ifdef KMESSDEBUG_RICHTEXTPARSER
      kDebug() << "Not inserting emoticon additional link for " << ei.key() << ". allowAddingEmoticons=" << allowAddingEmoticons
        << "; customHashes.value( ei.key() ) = " << customHashes.value( ei.key() ) << "; emoticonIsAdded=" << emoticonManager_->emoticonIsAdded( customHashes.value( ei.key() ) );
#endif
      emoticonReplacementList.insert( ei.key(), ei.value() );
    }
    ++ei;
  }
  // TODO: also do pending emoticons here

  // set up all regexps
  // The array index doubles as the 'case' label in the switch below; a null entry disables that step.
  static const int REGEXP_COUNT = 7;
  const QRegExp* regexps[REGEXP_COUNT];

  regexps[0] = ( showLinks ? &linkRegExp     : 0 );
  regexps[1] = ( showLinks ? &emailRegExp    : 0 );
  regexps[2] = ( ( showEmoticons && ! customRegExp  .isEmpty() ) ? &customRegExp   : 0 );
  regexps[3] = ( ( showEmoticons && ! pendingRegExp .isEmpty() ) ? &pendingRegExp  : 0 );
  regexps[4] = ( showLinks ? &geekLinkRegExp : 0 );
  regexps[5] = ( ( showEmoticons && ! emoticonRegExp.isEmpty() ) ? &emoticonRegExp : 0 );
  regexps[6] = ( &longWordsRegExp );

  // We apply the regexps in order, and each time, we take the matched part out of the string and re-add it
  // to the QStringList. Every piece of already-parsed data is HTML and starts with <, so we know what strings
  // not to parse.
  QString parseString, matched, replacement, tld, placeholderId, link, wordWrapLink;
  QStringList output( text );
#if QT_VERSION >= 0x040500
  const QRegExp *regexp;
#else
  QRegExp *regexp;
#endif
  int index;

  for( int i = 0; i < REGEXP_COUNT; ++i )
  {
#if QT_VERSION >= 0x040500
    regexp = regexps[i];
#else
    regexp = const_cast<QRegExp*>( regexps[i] );   // QRegExp->cap() is not const in Qt 4.4
#endif
    if( regexp == 0 )
    {
      continue;
    }

    // Note: 'output' grows while it is being walked, so its size is re-read at every iteration
    for( int j = 0; j < output.size(); ++j )
    {
      parseString = output.at( j );
#ifdef KMESSDEBUG_RICHTEXTPARSER
      kDebug() << "applying regexp" << i << "to string" << parseString;
#endif

      // Don't parse this piece, it's HTML
      if( parseString.startsWith( "<" ) )
      {
        continue;
      }
      else if( parseString.isEmpty() )
      {
        continue;
      }

      index = regexp->indexIn( parseString );

      if( index != -1 )
      {
#ifdef KMESSDEBUG_RICHTEXTPARSER
        kDebug() << "Regexp" << i << "matches at character" << index << "in string" << j << ": " << parseString;
#endif

        // Match, turn this piece into HTML (split string up into 3 strings)
        // Each case must set both 'matched' (the source text being consumed, which has to
        // start at 'index') and 'replacement' (what takes its place in the output).
        switch( i )
        {
          // Normal link
          case 0:
            matched = regexp->cap( 1 );

            // Some link normalizing: only allow ')' at the end if there's '(', same with ", etc
            if( matched.endsWith(")") && ! matched.contains("(") )
            {
              matched.chop( 1 );
            }
            if( matched.endsWith("&gt;") && ! matched.contains("&lt;") )
            {
              matched.chop( 4 );
            }
            if( matched.endsWith("}") && ! matched.contains("{") )
            {
              matched.chop( 1 );
            }
            if( matched.endsWith   ( "&#34;" )
            &&  matched.lastIndexOf( "&#34;", -6 ) == -1 )
            {
              matched.chop( 5 );
            }
            if( matched.endsWith   ( "&#39;" )
            &&  matched.lastIndexOf( "&#39;", -6 ) == -1 )
            {
              matched.chop( 5 );
            }

            // pre-wordwrap the link: make sure it word wraps nicely in KHTML :)
            wordWrapLink = QString();
            for( int k = 0; k < matched.length(); k += WORDWRAP_EVERY )
            {
              int nextSemi = matched.indexOf( ";", k + WORDWRAP_EVERY );
              if( nextSemi - (k + WORDWRAP_EVERY) >= 0 && nextSemi - (k + WORDWRAP_EVERY) <= 5 )
              {
                // There's a ; just after this text piece, so this piece may end just inside a HTML entity.
                // Therefore, we can't simply add <wbr/> here, we need to insert it just before
                // the &.
                int htmlEntityStarts = matched.lastIndexOf( "&", nextSemi );
                int htmlEntityLength = nextSemi - htmlEntityStarts;

                // all text before the entity, then a wbr, then the entity
                // (without its closing ';', which is picked up by the next run)
                wordWrapLink.append( matched.mid( k, htmlEntityStarts - k ) + "<wbr/>" + matched.mid( htmlEntityStarts, htmlEntityLength ) );

                // then increase the current position just a little so the next run starts after the entity
                k += nextSemi - ( k + WORDWRAP_EVERY );
              } else {
                // otherwise, just append.
                wordWrapLink.append( matched.mid( k, WORDWRAP_EVERY ) + "<wbr/>" );
              }
            }
            wordWrapLink.chop( 6 ); // chop off the last <wbr/>

            if( matched.startsWith( "www." ) )
            {
              replacement = "<a href=\"http://" + matched + "\" target=\"_blank\">" +
                            wordWrapLink + "</a>";
            }
            else
            {
              replacement = "<a href=\"" + matched + "\" target=\"_blank\">" +
                            wordWrapLink + "</a>";
            }
            break;


          // Email link
          case 1:
            matched = regexp->cap( 1 );
            replacement = "<a href=\"mailto:" + matched + "\">" + matched + "</a>";
            break;


          // Geek-style link
          case 4:
            matched = regexp->cap( 1 );
            tld     = geekLinkRegExp.cap( 3 );
            if( ( tld.length() == 2 && ! invalidCcTldList.contains( tld.toLower() ) )
            ||  ( tld.length() == 3 && topLevelDomainList.contains( tld.toLower() ) ) )
            {
              replacement = "<a href=\"http://" + matched + "/\" target=\"_blank\">" + matched + "</a>";
            }
            else
            {
              // Not a valid geeklink, don't replace it
              replacement = matched;
            }
            break;


          // Pending emoticon
          case 3:
            // For now, we will have to do pending emoticons seperately from custom
            // and normal ones. TODO: fix this.

            // Read the match first: both the blacklist test and the fragment split below
            // must work on the text of *this* match, not on a leftover from a previous one.
            matched = regexp->cap( 0 );

            // Don't replace anything if this emoticon is blacklisted
            if( customEmoticonsBlacklist.contains( matched ) )
            {
              replacement = matched;
              break;
            }

            placeholderId = "ce" + QString::number( ++lastPendingEmoticonId_ );
            pendingTags->append( placeholderId );

            // Insert placeholder
            replacement = "<img id='" + placeholderId +
                          "' src='" + pendingEmoticonPlaceholder + // This is already escaped
                          "' alt='" + matched +
                          "' contact='" + Qt::escape( handle ) +
                          "' width='16' height='16' valign='middle' "
                          "class='customEmoticonPlaceholder' />";
            break;


          // Custom emoticon: parsed like standard emoticons, to allow
          // overwriting a standard emoticon's shortcut with a custom one
          case 2:
          // Standard emoticon
          case 5:
            matched = regexp->cap( 0 );

            // note that a regexp match starting with a html entity (such as a custom emoticon "&)" or the 
            // MSN emoticon "('.')" ) will appear as for example '&amp;)' in the regexp; this will not collide
            // with the emoticon ;) because it appears in the regexp as &#59;). So no worries, no hacks, no fixes needed.

            if( ! emoticonReplacementList.contains( matched ) )
            {
              kWarning() << "Emoticon replacement list does not contain matched emoticon"
                         << matched;
              replacement = matched;
              break;
            }
            replacement = emoticonReplacementList.value( matched );
            break;


          // Long words
          case 6:
            // Only the first chunk is consumed here; the rest of the word ends up in the
            // "text after" fragment and is examined again by the next iteration.
            matched = regexp->cap( 1 );
            replacement = matched + "<wbr/>";
            break;


          default:
            kWarning() << "Warning: Unhandled regexp";
            replacement.clear();
            break;
        }

        if( matched.isEmpty() )
        {
          kError() << "Zero-length regexp match in regexp" << i << "- string:" << parseString;
          kError() << "Regexp at this point: " << regexp->pattern();
#ifdef KMESSTEST
          KMESS_ASSERT( regexp->pattern() == regexps[i]->pattern() );
#endif
          // try to fix it by skipping this frame and leaving the rest the same...
          j++;
          continue;
        }

#ifdef KMESSDEBUG_RICHTEXTPARSER
        kDebug() << j     << ":" << parseString.left( index );
        kDebug() << j + 1 << ":" << replacement;
        kDebug() << j + 2 << ":" << parseString.mid( index + matched.length() );
#endif
        // Split fragment j into: text before the match, the replacement, text after the match
        output.replace( j, parseString.left( index ) );
        output.insert(  j + 1, replacement);
        output.insert(  j + 2, parseString.mid( index + matched.length() ) );

        // Increment j here: it'll be also incremented by the loop.
        // This is because j+1 now is an html string and needs not to be parsed
        j++;

#ifdef KMESSDEBUG_RICHTEXTPARSER
        kDebug() << "j is now:" << j;
#endif
      }
    }
  }

  text = output.join( "" )
  // Replace any "> "s in the message with ">&nbsp;" to avoid missing spaces after emoticons
               .replace( "> ", ">&nbsp;" )
  // Replace double spaces with double &nbsp;s so that they'll show properly
               .replace( "  ", "&nbsp;&nbsp;" );

  // TODO: We don't want to remove MSN+ tags when showFormatting is disabled!!
  // Replace the MSN Plus text formatting tags
  if( showFormatting )
  {
    parseMsnPlusString( text );
    getFormattedString( text );
  }
  else
  {
    getCleanString( text );
  }
}


Generated by  Doxygen 1.6.0   Back to index