Erlang Central

Difference between revisions of "String Sorting (Natural)"

From ErlangCentral Wiki

(Created)
 
m (Added a link to the Wikipedia page about Schwartzian Transform.)
Line 52: Line 52:
 
</code>
 
</code>
  
Let's brake it down: the sort/1 function first maps the list, appending keys. It then sorts the list based on the keys and later maps them away to present the result, the sorted list.
+
Let's brake it down: the sort/1 function first maps the list, appending keys. It then sorts the list based on the keys and later maps them away to present the result, the sorted list. This is called a [http://en.wikipedia.org/wiki/Schwartzian_transform Schwartzian Transform].
  
 
So the trick is in how the keys are created. To be able to compare strings with integers in them, they are broken down like this:
 
So the trick is in how the keys are created. To be able to compare strings with integers in them, they are broken down like this:

Revision as of 08:57, 20 February 2008


Contents

Problem

Your strings are sorted in a non-natural way, i.e. not the way your users would expect. Example:

z100
z1000
z2
z20

You'd like to have them sorted in a natural way:

z2
z20
z100
z1000


Solution

Using a function that treats numbers in strings as real integers when sorting, solves the problem (note that this code does not take floats into account).

sort(List) ->
    lists:map(fun({_, K}) -> K end,
	      lists:keysort(1, lists:map(fun alphanum_key/1, List))).

alphanum_key(Term) when is_list(Term) ->
    {alphanum_key(Term, term, []), Term};
alphanum_key(Term) ->
    {Term, Term}.

alphanum_key([], integer, [H|List]) ->
    lists:reverse([list_to_integer(H)|List]);
alphanum_key([], term, List) ->
    lists:reverse(List);
alphanum_key([C|Term], _Type, []) when is_integer(C), C >= 48, C =< 57 ->
    alphanum_key(Term, integer, [[C]]);
alphanum_key([C|Term], _Type, []) ->
    alphanum_key(Term, term, [[C]]);
alphanum_key([C|Term], integer, [H|List]) when is_integer(C), C >= 48, C =< 57 ->
    alphanum_key(Term, integer, [H ++ [C]|List]);
alphanum_key([C|Term], integer, [H|List]) ->
    alphanum_key(Term, term, [[C]] ++ [list_to_integer(H)|List]);
alphanum_key([C|Term], term, [H|List]) when is_integer(C), C >= 48, C =< 57 ->
    alphanum_key(Term, integer, [[C]] ++ [H|List]);
alphanum_key([C|Term], term, [H|List]) ->
    alphanum_key(Term, term, [H ++ [C]|List]).

Let's brake it down: the sort/1 function first maps the list, appending keys. It then sorts the list based on the keys and later maps them away to present the result, the sorted list. This is called a Schwartzian Transform.

So the trick is in how the keys are created. To be able to compare strings with integers in them, they are broken down like this:

1> nsort:alphanum_key("1 snel hest, 2 arg apa", integer, []).
[1," snel hest, ",2," arg apa"]

The list is then compared element by element, thus rendering e.g. 20 larger 1000.

The keys are generated by parsing the string, finding numbers and gather them up to form a complete number (i.e. no letters inbetween), converting that to an Erlang integer

The function alphanum_key/3 parses a string from its first argument, remembers the type of the last parsed character in its second argument, and puts the result in the third argument, the accumulator.

When the keys are generated they are put together as keys with the original lists and those tuple pairs are sorted with lists:keysort/2.

The users are now happy!

2> nsort:sort(["pic129", "pic21", "pic02", "pic1", "pic350", "pic1001"]).
["pic1","pic02","pic21","pic129","pic350","pic1001"]


Authors

Adam Lindberg works as a consultant for Erlang Training & Consulting.

The code has no copyright or license. It is public domain. It should be possible to optimize it too, but it's way faster than the regexp version at least.


Full Source

-module(nsort).

-export([sort/1]).

-ifdef(EUNIT).
-include("nsort_tests.hrl").
-endif.

%%%----------------------------------------------------------------------------
%%% @spec sort(List) -> list()
%%%     List = list()
%%%
%%% @doc Sorts a list of strings with integers in a natural way.
%%% @end
%%%----------------------------------------------------------------------------
sort(List) ->
    lists:map(fun({_, K}) -> K end,
	      lists:keysort(1, lists:map(fun alphanum_key/1, List))).

alphanum_key(Term) when is_list(Term) ->
    {alphanum_key(Term, term, []), Term};
alphanum_key(Term) ->
    {Term, Term}.


alphanum_key([], integer, [H|List]) ->
    lists:reverse([list_to_integer(H)|List]);
alphanum_key([], term, List) ->
    lists:reverse(List);
alphanum_key([C|Term], _Type, [])
  when is_integer(C), C >= 48, C =< 57 ->
    alphanum_key(Term, integer, [[C]]);
alphanum_key([C|Term], _Type, []) ->
    alphanum_key(Term, term, [[C]]);
alphanum_key([C|Term], integer, [H|List])
  when is_integer(C), C >= 48, C =< 57 ->
    alphanum_key(Term, integer, [H ++ [C]|List]);
alphanum_key([C|Term], integer, [H|List]) ->
    alphanum_key(Term, term, [[C]] ++ [list_to_integer(H)|List]);
alphanum_key([C|Term], term, [H|List])
  when is_integer(C), C >= 48, C =< 57 ->
    alphanum_key(Term, integer, [[C]] ++ [H|List]);
alphanum_key([C|Term], term, [H|List]) ->
    alphanum_key(Term, term, [H ++ [C]|List]).


-include_lib("eunit.hrl").

-compile(export_all).

useful_test_() ->
    {inparallel,

     [{"Empty list",
       ?_assertMatch([],
		     nsort:sort([]))},
      
      {"List with single character strings containing no digits",
       ?_assertMatch(["a","b","c","d","f"],
		     nsort:sort(["d", "a", "f", "c", "b"]))},
      
      {"List with strings containing no digits",
       ?_assertMatch(["apa","hund","katt","krokodil","panda"],
		    nsort:sort(["hund","panda","krokodil","apa","katt"]))},
      
      {"List with strings containing digits",
       ?_assertMatch(["z1", "z10", "z20", "z50", "z100", "z500", "z1000",
		      "z1001"],
		     nsort:sort(["z20", "z1001", "z1", "z100", "z50", "z500",
				 "z10", "z1000"]))},

      {"Using official test strings from Dave Koelle",
       ?_assert(["10X Radonius",
		 "20X Radonius",
		 "20X Radonius Prime",
		 "30X Radonius",
		 "40X Radonius",
		 "200X Radonius",
		 "1000X Radonius Maximus",
		 "Allegia 50 Clasteron",
		 "Allegia 51 Clasteron",
		 "Allegia 51B Clasteron",
		 "Allegia 52 Clasteron",
		 "Allegia 60 Clasteron",
		 "Allegia 500 Clasteron",
		 "Alpha 2",
		 "Alpha 2A",
		 "Alpha 2A-900",
		 "Alpha 2A-8000",
		 "Alpha 100",
		 "Alpha 200",
		 "Callisto Morphamax",
		 "Callisto Morphamax 500",
		 "Callisto Morphamax 600",
		 "Callisto Morphamax 700",
		 "Callisto Morphamax 5000",
		 "Callisto Morphamax 7000",
		 "Callisto Morphamax 7000 SE",
		 "Callisto Morphamax 7000 SE2",
		 "QRS-60 Intrinsia Machine",
		 "QRS-60F Intrinsia Machine",
		 "QRS-62 Intrinsia Machine",
		 "QRS-62F Intrinsia Machine",
		 "Xiph Xlater 5",
		 "Xiph Xlater 40",
		 "Xiph Xlater 50",
		 "Xiph Xlater 58",
		 "Xiph Xlater 300",
		 "Xiph Xlater 500",
		 "Xiph Xlater 2000",
		 "Xiph Xlater 5000",
		 "Xiph Xlater 10000"] =:=
		nsort:sort(["1000X Radonius Maximus",
			    "10X Radonius",
			    "200X Radonius",
			    "20X Radonius",
			    "20X Radonius Prime",
			    "30X Radonius",
			    "40X Radonius",
			    "Allegia 50 Clasteron",
			    "Allegia 500 Clasteron",
			    "Allegia 51 Clasteron",
			    "Allegia 51B Clasteron",
			    "Allegia 52 Clasteron",
			    "Allegia 60 Clasteron",
			    "Alpha 100",
			    "Alpha 2",
			    "Alpha 200",
			    "Alpha 2A",
			    "Alpha 2A-8000",
			    "Alpha 2A-900",
			    "Callisto Morphamax",
			    "Callisto Morphamax 500",
			    "Callisto Morphamax 5000",
			    "Callisto Morphamax 600",
			    "Callisto Morphamax 700",
			    "Callisto Morphamax 7000",
			    "Callisto Morphamax 7000 SE",
			    "Callisto Morphamax 7000 SE2",
			    "QRS-60 Intrinsia Machine",
			    "QRS-60F Intrinsia Machine",
			    "QRS-62 Intrinsia Machine",
			    "QRS-62F Intrinsia Machine",
			    "Xiph Xlater 10000",
			    "Xiph Xlater 2000",
			    "Xiph Xlater 300",
			    "Xiph Xlater 40",
			    "Xiph Xlater 5",
			    "Xiph Xlater 50",
			    "Xiph Xlater 500",
			    "Xiph Xlater 5000",
			    "Xiph Xlater 58" ]))}]}.

wierd_test_() ->
    {inparallel, 
     
     [{"Single element list",
       ?_assertMatch([1],
		     nsort:sort([1]))},
      
      {"List with single digit integers",
       ?_assertMatch([1,2],
		     nsort:sort([2,1]))},
      
      {"List with two digit elements",
       ?_assertMatch([1,10,20,50,100,500,1000,1001],
		     nsort:sort([1001,10,1,50,100,20,1000,500]))},

      {"List with atoms",
       ?_assertMatch([a, b, c, d, e, f],
		     nsort:sort([b, e, c, f, a, d]))},
      
      {"List with tuples",
       ?_assertMatch([{10}, {a}, {b,1}, {c,0}, {"t20", 2000}],
		     nsort:sort([{c,0}, {b,1}, {"t20", 2000}, {10}, {a}]))},

      {"List with lists which are not strings",
       ?_assertMatch([[a], [b], [c], [d], [e]],
		     nsort:sort([[b], [d], [a], [e], [c]]))}]}.

cover_test_() ->
    {setup, fun cover:start/0, fun cover_reset/0,
     useful_test_()}.

cover_reset() ->
    cover:reset(nsort).